Date: (Fri) Jan 15, 2016
Data: Source: Training: https://www.kaggle.com/c/facial-keypoints-detection/download/training.zip
New: https://www.kaggle.com/c/facial-keypoints-detection/download/test.zip
Time period:
Based on analysis utilizing <> techniques,
Summary of key steps & error improvement stats:
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
rm(list = ls())
set.seed(12345)
options(stringsAsFactors = FALSE)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/mytm.R")
# Gather all package requirements here
suppressPackageStartupMessages(require(doMC))
glbCores <- 6 # of cores on machine - 2
registerDoMC(glbCores)
suppressPackageStartupMessages(require(caret))
require(plyr)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(knitr)
## Loading required package: knitr
#source("dbgcaret.R")
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# url/name = "<pointer>"; if url specifies a zip file, name = "<filename>"
# sep = choose from c(NULL, "\t")
glbObsTrnFile <- list(url = "https://www.kaggle.com/c/facial-keypoints-detection/download/training.zip",
name = "training/training.csv")
glbObsNewFile <- list(url = "https://www.kaggle.com/c/facial-keypoints-detection/download/test.zip",
name = "test/test.csv") # default OR
#list(splitSpecs = list(method = NULL #select from c(NULL, "condition", "sample", "copy")
# ,nRatio = 0.3 # > 0 && < 1 if method == "sample"
# ,seed = 123 # any integer or glbObsTrnPartitionSeed if method == "sample"
# ,condition = # or 'is.na(<var>)'; '<var> <condition_operator> <value>'
# )
# )
glbInpMerge <- NULL #: default
# list(fnames = c("<fname1>", "<fname2>")) # files will be concatenated
glb_is_separate_newobs_dataset <- TRUE # or TRUE
glb_split_entity_newobs_datasets <- TRUE # FALSE not supported - use "copy" for glbObsNewFile$splitSpecs$method # select from c(FALSE, TRUE)
glbObsDropCondition <- NULL # : default
# enclose in single-quotes b/c condition might include double qoutes
# use | & ; NOT || &&
# '<condition>'
# 'grepl("^First Draft Video:", glbObsAll$Headline)'
# '(is.na(glbObsAll[, glb_rsp_var_raw]) & grepl("Train", glbObsAll[, glbFeatsId]))'
#nrow(do.call("subset",list(glbObsAll, parse(text=paste0("!(", glbObsDropCondition, ")")))))
glb_obs_repartition_train_condition <- NULL # : default
# "<condition>"
glb_max_fitobs <- NULL # or any integer
glbObsTrnPartitionSeed <- 123 # or any integer
glb_is_regression <- TRUE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- NULL # or TRUE or FALSE
glb_rsp_var_raw <- ".pos.y"
# for classification, the response variable has to be a factor
glb_rsp_var <- glb_rsp_var_raw # or "left_eye_center_x.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
glb_map_rsp_raw_to_var <- NULL
# function(raw) {
# return(raw ^ 0.5)
# return(log(raw))
# return(log(1 + raw))
# return(log10(raw))
# return(exp(-raw / 2))
# ret_vals <- rep_len(NA, length(raw)); ret_vals[!is.na(raw)] <- ifelse(raw[!is.na(raw)] == 1, "Y", "N"); return(relevel(as.factor(ret_vals), ref="N"))
# as.factor(paste0("B", raw))
# as.factor(gsub(" ", "\\.", raw))
# }
#if glb_rsp_var_raw is numeric:
#print(summary(glbObsAll[, glb_rsp_var_raw]))
#glb_map_rsp_raw_to_var(tst <- c(NA, as.numeric(summary(glbObsAll[, glb_rsp_var_raw]))))
#if glb_rsp_var_raw is character:
#print(table(glbObsAll[, glb_rsp_var_raw]))
#glb_map_rsp_raw_to_var(tst <- c(NA, names(table(glbObsAll[, glb_rsp_var_raw]))))
glb_map_rsp_var_to_raw <- NULL
# function(var) {
# return(var ^ 2.0)
# return(exp(var))
# return(10 ^ var)
# return(-log(var) * 2)
# as.numeric(var)
# gsub("\\.", " ", levels(var)[as.numeric(var)])
# c("<=50K", " >50K")[as.numeric(var)]
# c(FALSE, TRUE)[as.numeric(var)]
# }
# glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst))
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var))
stop("glb_map_rsp_raw_to_var function expected")
# List info gathered for various columns
# <col_name>: <description>; <notes>
# currently does not handle more than 1 column; consider concatenating multiple columns
# If glbFeatsId == NULL, ".rownames <- as.numeric(row.names())" is the default
glbFeatsId <- "ImageId" # choose from c(NULL : default, "<id_feat>")
glbFeatsCategory <- "Image.pxl.1.dgt.1" # choose from c(NULL : default, "<category_feat>")
# User-specified exclusions
glbFeatsExcludeLcl <- c(NULL
# Required outputs
,"left_eye_center_x", "left_eye_center_y"
,"right_eye_center_x", "right_eye_center_y"
,"left_eye_inner_corner_x", "left_eye_inner_corner_y"
,"left_eye_outer_corner_x", "left_eye_outer_corner_y"
,"right_eye_inner_corner_x", "right_eye_inner_corner_y"
,"right_eye_outer_corner_x", "right_eye_outer_corner_y"
,"left_eyebrow_inner_end_x", "left_eyebrow_inner_end_y"
,"left_eyebrow_outer_end_x", "left_eyebrow_outer_end_y"
,"right_eyebrow_inner_end_x", "right_eyebrow_inner_end_y"
,"right_eyebrow_outer_end_x", "right_eyebrow_outer_end_y"
,"nose_tip_x", "nose_tip_y"
,"mouth_left_corner_x", "mouth_left_corner_y"
,"mouth_right_corner_x", "mouth_right_corner_y"
,"mouth_center_top_lip_x", "mouth_center_top_lip_y"
,"mouth_center_bottom_lip_x", "mouth_center_bottom_lip_y"
)
glbFeatsExclude <- c(NULL
# Feats that shd be excluded due to known causation by prediction variable
# , "<feat1", "<feat2>"
# Feats that are linear combinations (alias in glm)
# Feature-engineering phase -> start by excluding all features except id & category & work each one in
,glbFeatsExcludeLcl
,"Image.pxl.1.dgt.1"
)
if (glb_rsp_var_raw != glb_rsp_var)
glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
glbFeatsInteractionOnly <- list()
#glbFeatsInteractionOnly[["<child_feat>"]] <- "<parent_feat>"
glbFeatsDrop <- c(NULL
# , "<feat1>", "<feat2>"
)
glb_map_vars <- NULL # or c("<var1>", "<var2>")
glb_map_urls <- list();
# glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_assign_pairs_lst <- NULL;
# glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
# to=c("NA.my"))
glb_assign_vars <- names(glb_assign_pairs_lst)
# Derived features; Use this mechanism to cleanse data ??? Cons: Data duplication ???
glbFeatsDerive <- list();
# glbFeatsDerive[["<feat.my.sfx>"]] <- list(
# mapfn = function(<arg1>, <arg2>) { return(function(<arg1>, <arg2>)) }
# , args = c("<arg1>", "<arg2>"))
#myprint_df(data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos)))
#data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos))[7045:7055, ]
# character
# mapfn = function(Week) { return(substr(Week, 1, 10)) }
# mapfn = function(descriptor) { return(plyr::revalue(descriptor, c(
# "ABANDONED BUILDING" = "OTHER",
# "**" = "**"
# ))) }
# mapfn = function(description) { mod_raw <- description;
# This is here because it does not work if it's in txt_map_filename
# mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ", mod_raw)
# Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
# Some state acrnoyms need context for separation e.g.
# LA/L.A. could either be "Louisiana" or "LosAngeles"
# modRaw <- gsub("\\bL\\.A\\.( |,|')", "LosAngeles\\1", modRaw);
# OK/O.K. could either be "Oklahoma" or "Okay"
# modRaw <- gsub("\\bACA OK\\b", "ACA OKay", modRaw);
# modRaw <- gsub("\\bNow O\\.K\\.\\b", "Now OKay", modRaw);
# PR/P.R. could either be "PuertoRico" or "Public Relations"
# modRaw <- gsub("\\bP\\.R\\. Campaign", "PublicRelations Campaign", modRaw);
# VA/V.A. could either be "Virginia" or "VeteransAdministration"
# modRaw <- gsub("\\bthe V\\.A\\.\\:", "the VeteranAffairs:", modRaw);
#
# Custom mods
# return(mod_raw) }
# numeric
# Create feature based on record position/id in data
glbFeatsDerive[[".pos"]] <- list(
mapfn = function(.rnorm) { return(1:length(.rnorm)) }
, args = c(".rnorm"))
glbFeatsDerive[[".pos.y"]] <- list(
mapfn = function(.rnorm) { return(1:length(.rnorm)) }
, args = c(".rnorm"))
glbFeatsDerive[["ImageId"]] <- list(
mapfn = function(.src, .pos) {
# return(paste(.src, sprintf("%04d", .pos), sep = "#"))
return(paste(.src, sprintf("%04d",
ifelse(.src == "Train", .pos, .pos - 7049)
), sep = "#"))
}
, args = c(".src", ".pos"))
glbFeatsDerive[["left_eye_center_x"]] <- list(
mapfn = function(left_eye_center_x) { return(as.integer(left_eye_center_x)) }
, args = c("left_eye_center_x"))
glbFeatsDerive[["left_eye_center_y"]] <- list(
mapfn = function(left_eye_center_y) { return(as.integer(left_eye_center_y)) }
, args = c("left_eye_center_y"))
#myprint_df(data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos)))
#data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos))[7045:7055, ]
glbFeatsDerive[["Image.pxl.1.dgt.1"]] <- list(
# mapfn = function(Image) { return(cut(as.integer(sapply(Image, function(img) strsplit(img, " ")[[1]][1])),
# breaks = 5)) }
mapfn = function(Image) { return(substr(Image, 1, 1)) }
, args = c("Image"))
# Add logs of numerics that are not distributed normally
# Derive & keep multiple transformations of the same feature, if normality is hard to achieve with just one transformation
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# glbFeatsDerive[["WordCount.log1p"]] <- list(
# mapfn = function(WordCount) { return(log1p(WordCount)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.root2"]] <- list(
# mapfn = function(WordCount) { return(WordCount ^ (1/2)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.nexp"]] <- list(
# mapfn = function(WordCount) { return(exp(-WordCount)) }
# , args = c("WordCount"))
#print(summary(glbObsAll$WordCount))
#print(summary(mapfn(glbObsAll$WordCount)))
# mapfn = function(HOSPI.COST) { return(cut(HOSPI.COST, 5, breaks = c(0, 100000, 200000, 300000, 900000), labels = NULL)) }
# mapfn = function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn = function(startprice) { return(startprice ^ (1/2)) }
# mapfn = function(startprice) { return(log(startprice)) }
# mapfn = function(startprice) { return(exp(-startprice / 20)) }
# mapfn = function(startprice) { return(scale(log(startprice))) }
# mapfn = function(startprice) { return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
# factor
# mapfn = function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn = function(productline, description) { as.factor(gsub(" ", "", productline)) }
# mapfn = function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn = function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# mapfn = function(startprice.log10) { return(cut(startprice.log10, 3)) }
# mapfn = function(startprice.log10) { return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
# , args = c("<arg1>"))
# multiple args
# mapfn = function(id, date) { return(paste(as.character(id), as.character(date), sep = "#")) }
# mapfn = function(PTS, oppPTS) { return(PTS - oppPTS) }
# mapfn = function(startprice.log10.predict, startprice) {
# return(spdiff <- (10 ^ startprice.log10.predict) - startprice) }
# mapfn = function(productline, description) { as.factor(
# paste(gsub(" ", "", productline), as.numeric(nchar(description) > 0), sep = "*")) }
# mapfn = function(.src, .pos) {
# return(paste(.src, sprintf("%04d",
# ifelse(.src == "Train", .pos, .pos - 7049)
# ), sep = "#")) }
# # If glbObsAll is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glbObsAll)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
glb_derive_vars <- names(glbFeatsDerive)
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glbObsAll[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
glbFeatsDateTime <- list()
# glbFeatsDateTime[["<DateTimeFeat>"]] <-
# c(format = "%Y-%m-%d %H:%M:%S", timezone = "America/New_York", impute.na = TRUE,
# last.ctg = TRUE, poly.ctg = TRUE)
glbFeatsPrice <- NULL # or c("<price_var>")
glbFeatsImage <- list(Image = list(patchSize = 10)) # if patchSize not specified, no patch computation
glbFeatsText <- list()
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
#glbFeatsText[["<TextFeature>"]] <- list(NULL,
# ,names = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-screened-names>
# ))))
# ,rareWords = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-nonSCOWL-words>
# ))))
#)
# Text Processing Step: custom modifications not present in txt_munge -> use glbFeatsDerive
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "<projectId>_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: myreplacePunctuation
# Text Processing Step: removeWords
glb_txt_stop_words <- list()
# Remember to use unstemmed words
if (length(glbFeatsText) > 0) {
require(tm)
require(stringr)
glb_txt_stop_words[["<txt_var>"]] <- sort(myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# Remove any words from stopwords
# , setdiff(myreplacePunctuation(stopwords("english")), c("<keep_wrd1>", <keep_wrd2>"))
# Remove salutations
,"mr","mrs","dr","Rev"
# Remove misc
#,"th" # Happy [[:digit::]]+th birthday
# Remove terms present in Trn only or New only; search for "Partition post-stem"
# ,<comma-separated-terms>
# cor.y.train == NA
# ,unlist(strsplit(paste(c(NULL
# ,"<comma-separated-terms>"
# ), collapse=",")
# freq == 1; keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
# chisq.pval high (e.g. == 1); keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
# nzv.freqRatio high (e.g. >= glbFeatsNzvFreqMax); keep c("<comma-separated-terms-to-keep>")
# ,<comma-separated-terms>
)))))
}
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^man", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 4866] > 0, c(glb_rsp_var, txtFeat)]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], freq <= 2)$term), collapse = ",")
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], term %in% c("zinger"))
# To identify terms with a specific freq &
# are not stemmed together later OR is value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (freq <= 2)))
#glbObsAll[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txtFeat]]), 5)
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^m", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txtFeat]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (nzv.freqRatio >= glbFeatsNzvFreqMax) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txtFeat]]), 20)
#mydspObs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glbObsAll[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glbFeatsCategory, "storage", txtFeat)]
# Text Processing Step: screen for names # Move to glbFeatsText specs section in order of text processing steps
# glbFeatsText[["<txtFeat>"]]$names <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Person names for names screening
# ,<comma-separated-list>
#
# # Company names
# ,<comma-separated-list>
#
# # Product names
# ,<comma-separated-list>
# ))))
# glbFeatsText[["<txtFeat>"]]$rareWords <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Words not in SCOWL db
# ,<comma-separated-list>
# ))))
# To identify char vectors post glbFeatsTextMap
#grep("six(.*)hour", glb_txt_chr_lst[[txtFeat]], ignore.case = TRUE, value = TRUE)
#grep("[S|s]ix(.*)[H|h]our", glb_txt_chr_lst[[txtFeat]], value = TRUE)
# To identify whether terms shd be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^moder", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^came$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][glbObsAll$.lcn == "Fit", term_row_df$pos], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(cluster_vars)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^la$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^con", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glbFeatsId, "productline", txtFeat)]
#glbObsAll[which(TfIdf_stem_mtrx[, 191] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#glbObsAll[which(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][, 6165] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#which(glbObsAll$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txtFeat]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^c", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
chk_comb_cor <- function(syn_lst) {
# cor(terms_stem_mtrx[glbObsAll$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
print(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], term %in% syn_lst$syns))
print(subset(get_corpus_terms(tm_map(glbFeatsTextCorpus[[txtFeat]], mycombineSynonyms, list(syn_lst), lazy=FALSE)), term == syn_lst$word))
# cor(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# cor(rowSums(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
glbFeatsTextSynonyms <- list()
# list parsed to collect glbFeatsText[[<txtFeat>]]$vldTerms
# glbFeatsTextSynonyms[["Hdln.my"]] <- list(NULL
# # people in places
# , list(word = "australia", syns = c("australia", "australian"))
# , list(word = "italy", syns = c("italy", "Italian"))
# , list(word = "newyork", syns = c("newyork", "newyorker"))
# , list(word = "Pakistan", syns = c("Pakistan", "Pakistani"))
# , list(word = "peru", syns = c("peru", "peruvian"))
# , list(word = "qatar", syns = c("qatar", "qatari"))
# , list(word = "scotland", syns = c("scotland", "scotish"))
# , list(word = "Shanghai", syns = c("Shanghai", "Shanzhai"))
# , list(word = "venezuela", syns = c("venezuela", "venezuelan"))
#
# # companies - needs to be data dependent
# # - e.g. ensure BNP in this experiment/feat always refers to BNPParibas
#
# # general synonyms
# , list(word = "Create", syns = c("Create","Creator"))
# , list(word = "cute", syns = c("cute","cutest"))
# , list(word = "Disappear", syns = c("Disappear","Fadeout"))
# , list(word = "teach", syns = c("teach", "taught"))
# , list(word = "theater", syns = c("theater", "theatre", "theatres"))
# , list(word = "understand", syns = c("understand", "understood"))
# , list(word = "weak", syns = c("weak", "weaken", "weaker", "weakest"))
# , list(word = "wealth", syns = c("wealth", "wealthi"))
#
# # custom synonyms (phrases)
#
# # custom synonyms (names)
# )
#glbFeatsTextSynonyms[["<txtFeat>"]] <- list(NULL
# , list(word="<stem1>", syns=c("<stem1>", "<stem1_2>"))
# )
for (txtFeat in names(glbFeatsTextSynonyms))
for (entryIx in 1:length(glbFeatsTextSynonyms[[txtFeat]])) {
glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word <-
str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word)
glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns <-
str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns)
}
glbFeatsTextSeed <- 181
# tm options include: check tm::weightSMART
glb_txt_terms_control <- list( # Gather model performance & run-time stats
# weighting = function(x) weightSMART(x, spec = "nnn")
# weighting = function(x) weightSMART(x, spec = "lnn")
# weighting = function(x) weightSMART(x, spec = "ann")
# weighting = function(x) weightSMART(x, spec = "bnn")
# weighting = function(x) weightSMART(x, spec = "Lnn")
#
weighting = function(x) weightSMART(x, spec = "ltn") # default
# weighting = function(x) weightSMART(x, spec = "lpn")
#
# weighting = function(x) weightSMART(x, spec = "ltc")
#
# weighting = weightBin
# weighting = weightTf
# weighting = weightTfIdf # : default
# termFreq selection criteria across obs: tm default: list(global=c(1, Inf))
, bounds = list(global = c(1, Inf))
# wordLengths selection criteria: tm default: c(3, Inf)
, wordLengths = c(1, Inf)
)
glb_txt_cor_var <- glb_rsp_var # : default # or c(<feat>)
# select one from c("union.top.val.cor", "top.cor", "top.val", default: "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
glbFeatsTextTermsMax <- rep(10, length(glbFeatsText)) # :default
names(glbFeatsTextTermsMax) <- names(glbFeatsText)
# Text Processing Step: extractAssoc
glbFeatsTextAssocCor <- rep(1, length(glbFeatsText)) # :default
names(glbFeatsTextAssocCor) <- names(glbFeatsText)
# Remember to use stemmed terms
glb_important_terms <- list()
# Text Processing Step: extractPatterns (ngrams)
glbFeatsTextPatterns <- list()
#glbFeatsTextPatterns[[<txtFeat>>]] <- list()
#glbFeatsTextPatterns[[<txtFeat>>]] <- c(metropolitan.diary.colon = "Metropolitan Diary:")
# Have to set it even if it is not used
# Properties:
# numrows(glb_feats_df) << numrows(glbObsFit
# Select terms that appear in at least 0.2 * O(FP/FN(glbObsOOB)) ???
# numrows(glbObsOOB) = 1.1 * numrows(glbObsNew) ???
glb_sprs_thresholds <- NULL # or c(<txtFeat1> = 0.988, <txtFeat2> = 0.970, <txtFeat3> = 0.970)
glbFctrMaxUniqVals <- 20 # default: 20
glb_impute_na_data <- FALSE # or TRUE
glb_mice_complete.seed <- 144 # or any integer
glb_cluster <- FALSE # : default or TRUE
glb_cluster.seed <- 189 # or any integer
glb_cluster_entropy_var <- NULL # c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glbFeatsTextClusterVarsExclude <- FALSE # default FALSE
glb_interaction_only_feats <- NULL # : default or c(<parent_feat> = "<child_feat>")
glbFeatsNzvFreqMax <- 19 # 19 : caret default
glbFeatsNzvUniqMin <- 10 # 10 : caret default
glbRFESizes <- list()
#glbRFESizes[["mdlFamily"]] <- c(4, 8, 16, 32, 64, 67, 68, 69) # Accuracy@69/70 = 0.8258
glbObsFitOutliers <- list()
# If outliers.n >= 10; consider concatenation of interaction vars
# glbObsFitOutliers[["<mdlFamily>"]] <- c(NULL
# is.na(.rstudent)
# is.na(.dffits)
# .hatvalues >= 0.99
# -38,167,642 < minmax(.rstudent) < 49,649,823
# , <comma-separated-<glbFeatsId>>
# )
glbObsTrnOutliers <- list()
# influence.measures: car::outlier; rstudent; dffits; hatvalues; dfbeta; dfbetas
#mdlId <- "RFE.X.glm"; obs_df <- fitobs_df
#mdlId <- "Final.glm"; obs_df <- trnobs_df
#mdlId <- "CSM2.X.glm"; obs_df <- fitobs_df
#print(outliers <- car::outlierTest(glb_models_lst[[mdlId]]$finalModel))
#mdlIdFamily <- paste0(head(unlist(str_split(mdlId, "\\.")), -1), collapse="."); obs_df <- dplyr::filter_(obs_df, interp(~(!(var %in% glbObsFitOutliers[[mdlIdFamily]])), var = as.name(glbFeatsId))); model_diags_df <- cbind(obs_df, data.frame(.rstudent=stats::rstudent(glb_models_lst[[mdlId]]$finalModel)), data.frame(.dffits=stats::dffits(glb_models_lst[[mdlId]]$finalModel)), data.frame(.hatvalues=stats::hatvalues(glb_models_lst[[mdlId]]$finalModel)));print(summary(model_diags_df[, c(".rstudent",".dffits",".hatvalues")])); table(cut(model_diags_df$.hatvalues, breaks=c(0.00, 0.98, 0.99, 1.00)))
#print(subset(model_diags_df, is.na(.rstudent))[, glbFeatsId])
#print(subset(model_diags_df, is.na(.dffits))[, glbFeatsId])
#print(model_diags_df[which.min(model_diags_df$.dffits), ])
#print(subset(model_diags_df, .hatvalues > 0.99)[, glbFeatsId])
#dffits_df <- merge(dffits_df, outliers_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#dffits_df <- merge(dffits_df, glbObsFit, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#subset(dffits_df, !is.na(.Bonf.p))
#mdlId <- "CSM.X.glm"; vars <- myextract_actual_feats(row.names(orderBy(reformulate(c("-", paste0(mdlId, ".imp"))), myget_feats_imp(glb_models_lst[[mdlId]]))));
#model_diags_df <- glb_get_predictions(model_diags_df, mdlId, glb_rsp_var)
#obs_ix <- row.names(model_diags_df) %in% names(outliers$rstudent)[1]
#obs_ix <- which(is.na(model_diags_df$.rstudent))
#obs_ix <- which(is.na(model_diags_df$.dffits))
#myplot_parcoord(obs_df=model_diags_df[, c(glbFeatsId, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, paste0(glb_rsp_var, mdlId), vars[1:min(20, length(vars))])], obs_ix=obs_ix, id_var=glbFeatsId, category_var=glbFeatsCategory)
#model_diags_df[row.names(model_diags_df) %in% names(outliers$rstudent)[c(1:2)], ]
#ctgry_diags_df <- model_diags_df[model_diags_df[, glbFeatsCategory] %in% c("Unknown#0"), ]
#myplot_parcoord(obs_df=ctgry_diags_df[, c(glbFeatsId, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:20])], obs_ix=row.names(ctgry_diags_df) %in% names(outliers$rstudent)[1], id_var=glbFeatsId, category_var=glbFeatsCategory)
#table(glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), "startprice.log10.cut.fctr"])
#glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), c(glbFeatsId, "startprice")]
# No outliers & .dffits == NaN
#myplot_parcoord(obs_df=model_diags_df[, c(glbFeatsId, glbFeatsCategory, glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:10])], obs_ix=seq(1:nrow(model_diags_df))[is.na(model_diags_df$.dffits)], id_var=glbFeatsId, category_var=glbFeatsCategory)
# Modify mdlId to (build & extract) "<FamilyId>#<Fit|Trn>#<caretMethod>#<preProc1.preProc2>#<samplingMethod>"
glb_models_lst <- list(); glb_models_df <- data.frame()
# Regression
if (glb_is_regression) {
glbMdlMethods <- c(NULL
# deterministic
#, "lm", # same as glm
, "glm", "bayesglm", "glmnet"
, "rpart"
# non-deterministic
, "gbm", "rf"
# Unknown
, "nnet" , "avNNet" # runs 25 models per cv sample for tunelength=5
, "svmLinear", "svmLinear2"
, "svmPoly" # runs 75 models per cv sample for tunelength=5
, "svmRadial"
, "earth"
, "bagEarth" # Takes a long time
)
} else
# Classification - Add ada (auto feature selection)
if (glb_is_binomial)
glbMdlMethods <- c(NULL
# deterministic
, "bagEarth" # Takes a long time
, "glm", "bayesglm", "glmnet"
, "nnet"
, "rpart"
# non-deterministic
, "gbm"
, "avNNet" # runs 25 models per cv sample for tunelength=5
, "rf"
# Unknown
, "lda", "lda2"
# svm models crash when predict is called -> internal to kernlab it should call predict without .outcome
, "svmLinear", "svmLinear2"
, "svmPoly" # runs 75 models per cv sample for tunelength=5
, "svmRadial"
, "earth"
) else
glbMdlMethods <- c(NULL
# deterministic
,"glmnet"
# non-deterministic
,"rf"
# Unknown
,"gbm","rpart"
)
glbMdlFamilies <- list(); glb_mdl_feats_lst <- list()
# family: Choose from c("RFE.X", "CSM.X", "All.X", "Best.Interact")
# methods: Choose from c(NULL, <method>, glbMdlMethods)
#glbMdlFamilies[["RFE.X"]] <- c("glmnet", "glm") # non-NULL vector is mandatory
glbMdlFamilies[["All.X"]] <- c("glmnet", "glm") # non-NULL vector is mandatory
#glbMdlFamilies[["Best.Interact"]] <- "glmnet" # non-NULL vector is mandatory
# Check if interaction features make RFE better
# glbMdlFamilies[["CSM.X"]] <- setdiff(glbMdlMethods, c("lda", "lda2")) # crashing due to category:.clusterid ??? #c("glmnet", "glm") # non-NULL list is mandatory
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , <comma-separated-features-vector>
# )
# dAFeats.CSM.X %<d-% c(NULL
# # Interaction feats up to varImp(RFE.X.glmnet) >= 50
# , <comma-separated-features-vector>
# , setdiff(myextract_actual_feats(predictors(rfe_fit_results)), c(NULL
# , <comma-separated-features-vector>
# ))
# )
# glb_mdl_feats_lst[["CSM.X"]] <- "%<d-% dAFeats.CSM.X"
glbMdlFamilies[["Final"]] <- c(NULL) # NULL vector acceptable
glbMdlAllowParallel <- list()
#glbMdlAllowParallel[["<mdlId>"]] <- FALSE
glbMdlAllowParallel[["All.X##rcv#glm"]] <- FALSE
# Check if tuning parameters make fit better; make it mdlFamily customizable ?
glbMdlTuneParams <- data.frame()
# When glmnet crashes at model$grid with error: ???
glmnetTuneParams <- rbind(data.frame()
,data.frame(parameter = "alpha", vals = "0.100 0.325 0.550 0.775 1.000")
,data.frame(parameter = "lambda", vals = "9.342e-02")
)
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams,
# cbind(data.frame(mdlId = "<mdlId>"),
# glmnetTuneParams))
#avNNet
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
#bagEarth
# degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "bagEarth", parameter = "nprune", vals = "256")
# ,data.frame(method = "bagEarth", parameter = "degree", vals = "2")
# ))
#earth
# degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
#gbm
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
#glmnet
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha", vals = "0.550 0.775 0.8875 0.94375 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda", vals = "9.858855e-05 0.0001971771 0.0009152152 0.0042480525 0.0197177130")
# ))
#nnet
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
#rf # Don't bother; results are not deterministic
# mtry=2 35 68 [101] 134; RMSE=0.1339974
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "2 5 9 13 17")
# ))
#rpart
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rpart", parameter = "cp", vals = "0.004347826 0.008695652 0.017391304 0.021739130 0.034782609")
# ))
#svmLinear
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
#svmLinear2
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
# ))
#svmPoly
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
#svmRadial
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
#glb2Sav(); all.equal(sav_models_df, glb_models_df)
glb_preproc_methods <- NULL
# c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<feat>")
glbMdlMetric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetricSummary <- NULL # or "<metric_name>"
glbMdlMetricMaximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricSummaryFn <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
glbMdlCheckRcv <- FALSE # Turn it on when needed; otherwise takes long time
glb_rcv_n_folds <- 3 # or NULL
glb_rcv_n_repeats <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
if (glb_is_regression)
glbMdlMetricsEval <- c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit", "min.RMSE.fit")
#glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
if (glb_is_classification) {
if (glb_is_binomial)
glbMdlMetricsEval <-
c("max.Accuracy.OOB", "max.AUCROCR.OOB", "max.AUCpROC.OOB", "min.aic.fit", "max.Accuracy.fit") else
glbMdlMetricsEval <- c("max.Accuracy.OOB", "max.Kappa.OOB")
}
# select from NULL [no ensemble models], "auto" [all models better than MFO or Baseline], c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
glb_mdl_ensemble <- NULL
# "%<d-% setdiff(mygetEnsembleAutoMdlIds(), 'CSM.X.rf')"
# c(<comma-separated-mdlIds>
# )
# Only for classifications; for regressions remove "(.*)\\.prob" form the regex
# tmp_fitobs_df <- glbObsFit[, grep(paste0("^", gsub(".", "\\.", mygetPredictIds$value, fixed = TRUE), "CSM\\.X\\.(.*)\\.prob"), names(glbObsFit), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[["Ensemble.repeatedcv.glmnet"]])$imp))[1], ]); summary(cor_vctr); cor_vctr
#ntv.glm <- glm(reformulate(indep_vars, glb_rsp_var), family = "binomial", data = glbObsFit)
#step.glm <- step(ntv.glm)
glb_sel_mdl_id <- "All.X##rcv#glmnet" #select from c(NULL, "All.X##rcv#glmnet", "RFE.X##rcv#glmnet", <mdlId>)
glb_fin_mdl_id <- NULL #select from c(NULL, glb_sel_mdl_id)
glb_dsp_cols <- c(".pos", glbFeatsId, glbFeatsCategory, glb_rsp_var
# List critical cols excl. above
,"left_eye_center_x","left_eye_center_y"
)
# Output specs
glbObsOut <- list(NULL
# glbFeatsId will be the first output column, by default
,vars = list()
,mapFn = function(obsout_df) {
require(tidyr)
smpout_df <- read.csv('data/IdLookupTable.csv')
tmpout_df <- obsout_df %>%
tidyr::gather(key = FeatureName, value = Location, -ImageId) %>%
merge(smpout_df[, -4], all.y = TRUE, sort = FALSE) %>%
select(matches("(RowId|Location)"))
return(tmpout_df <- orderBy(~RowId, tmpout_df[, c("RowId", "Location")]))
}
)
glb_out_obs <- NULL # select from c(NULL : default to "new", "all", "new", "trn")
if (glb_is_classification && glb_is_binomial) {
glbObsOut$vars[["Probability1"]] <-
"%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$prob]"
} else {
glbObsOut$vars[[glbFeatsId]] <-
"%<d-% as.integer(gsub('Test#', '', glbObsNew[, glbFeatsId]))"
glbObsOut$vars[[glb_rsp_var]] <-
"%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$value]"
for (outVar in setdiff(glbFeatsExcludeLcl, glb_rsp_var_raw))
glbObsOut$vars[[outVar]] <-
paste0("%<d-% mean(glbObsAll[, \"", outVar, "\"], na.rm = TRUE)")
}
# glbObsOut$vars[[glb_rsp_var_raw]] <- glb_rsp_var_raw
# glbObsOut$vars[[paste0(head(unlist(strsplit(mygetPredictIds$value, "")), -1), collapse = "")]] <-
glbOutStackFnames <- NULL #: default
# c("ebayipads_txt_assoc1_out_bid1_stack.csv") # manual stack
# c("ebayipads_finmdl_bid1_out_nnet_1.csv") # universal stack
glbOut <- list(pfx = "Faces_patch_mean_datafix_")
lclImageSampleSeed <- 129
glbOutDataVizFname <- NULL # choose from c(NULL, "<projectId>_obsall.csv")
glbChunks <- list(labels = c("set_global_options_wd","set_global_options"
,"import.data","inspect.data","scrub.data","transform.data"
,"extract.features"
,"extract.features.datetime","extract.features.image","extract.features.price"
,"extract.features.text","extract.features.string"
,"extract.features.end"
,"manage.missing.data","cluster.data","partition.data.training","select.features"
,"fit.models_0","fit.models_1","fit.models_2","fit.models_3"
,"fit.data.training_0","fit.data.training_1"
,"predict.data.new"
,"display.session.info"))
# To ensure that all chunks in this script are in glbChunks
if (!is.null(chkChunksLabels <- knitr::all_labels()) && # knitr::all_labels() doesn't work in console runs
!identical(chkChunksLabels, glbChunks$labels)) {
print(sprintf("setdiff(chkChunksLabels, glbChunks$labels): %s",
setdiff(chkChunksLabels, glbChunks$labels)))
print(sprintf("setdiff(glbChunks$labels, chkChunksLabels): %s",
setdiff(glbChunks$labels, chkChunksLabels)))
}
glbChunks[["first"]] <- NULL #default: script will load envir from previous chunk
glbChunks[["last"]] <- "extract.features.end" #NULL #default: script will save envir at end of this chunk
#mysavChunk(glbOut$pfx, glbChunks[["last"]])
# Inspect max OOB FP
#chkObsOOB <- subset(glbObsOOB, !label.fctr.All.X..rcv.glmnet.is.acc)
#chkObsOOBFP <- subset(chkObsOOB, label.fctr.All.X..rcv.glmnet == "left_eye_center") %>% dplyr::mutate(Probability1 = label.fctr.All.X..rcv.glmnet.prob) %>% select(-.src, -.pos, -x, -y) %>% lclgetfltout_df() %>% mutate(obj.distance = (((as.numeric(x) - left_eye_center_x.int) ^ 2) + ((as.numeric(y) - left_eye_center_y.int) ^ 2)) ^ 0.5) %>% dplyr::top_n(5, obj.distance) %>% dplyr::top_n(5, -patch.cor)
#
#newImgObs <- glbObsNew[(glbObsNew$ImageId == "Test#0001"), ]; print(newImgObs[which.max(newImgObs$label.fctr.Final..rcv.glmnet.prob), ])
#OOBImgObs <- glbObsOOB[(glbObsOOB$ImageId == "Train#0003"), ]; print(OOBImgObs[which.max(OOBImgObs$label.fctr.All.X..rcv.glmnet.prob), ])
#load("Faces_patch_mean_datafix_extract.features.end.RData", verbose = TRUE)
#mygetImage(which(glbObsAll[, glbFeatsId] == "Train#0003"), names(glbFeatsImage)[1], plot = TRUE, featHighlight = c("left_eye_center_x", "left_eye_center_y"), ovrlHighlight = c(66, 35))
# Depict process
glb_analytics_pn <- petrinet(name = "glb_analytics_pn",
trans_df = data.frame(id = 1:6,
name = c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df = data.frame(
begin = c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end = c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 11.547 NA NA
1.0: import data## [1] "Reading file ./data/training/training.csv..."
## [1] "dimensions of data in ./data/training/training.csv: 7,049 rows x 31 cols"
## [1] " Truncating Image to first 100 chars..."
## left_eye_center_x left_eye_center_y right_eye_center_x
## 1 66.03356 39.00227 30.22701
## 2 64.33294 34.97008 29.94928
## 3 65.05705 34.90964 30.90379
## 4 65.22574 37.26177 32.02310
## 5 66.72530 39.62126 32.24481
## 6 69.68075 39.96875 29.18355
## right_eye_center_y left_eye_inner_corner_x left_eye_inner_corner_y
## 1 36.42168 59.58208 39.64742
## 2 33.44871 58.85617 35.27435
## 3 34.90964 59.41200 36.32097
## 4 37.26177 60.00334 39.12718
## 5 38.04203 58.56589 39.62126
## 6 37.56336 62.86430 40.16927
## left_eye_outer_corner_x left_eye_outer_corner_y right_eye_inner_corner_x
## 1 73.13035 39.97000 36.35657
## 2 70.72272 36.18717 36.03472
## 3 70.98442 36.32097 37.67811
## 4 72.31471 38.38097 37.61864
## 5 72.51593 39.88447 36.98238
## 6 76.89824 41.17189 36.40105
## right_eye_inner_corner_y right_eye_outer_corner_x
## 1 37.38940 23.45287
## 2 34.36153 24.47251
## 3 36.32097 24.97642
## 4 38.75411 25.30727
## 5 39.09485 22.50611
## 6 39.36763 21.76553
## right_eye_outer_corner_y left_eyebrow_inner_end_x
## 1 37.38940 56.95326
## 2 33.14444 53.98740
## 3 36.60322 55.74253
## 4 38.00790 56.43381
## 5 38.30524 57.24957
## 6 38.56553 59.76628
## left_eyebrow_inner_end_y left_eyebrow_outer_end_x
## 1 29.03365 80.22713
## 2 28.27595 78.63421
## 3 27.57095 78.88737
## 4 30.92986 77.91026
## 5 30.67218 77.76294
## 6 31.65129 83.31364
## left_eyebrow_outer_end_y right_eyebrow_inner_end_x
## 1 32.22814 40.22761
## 2 30.40592 42.72885
## 3 32.65162 42.19389
## 4 31.66573 41.67151
## 5 31.73725 38.03544
## 6 35.35806 39.40800
## right_eyebrow_inner_end_y right_eyebrow_outer_end_x
## 1 29.00232 16.35638
## 2 26.14604 16.86536
## 3 28.13545 16.79116
## 4 31.04999 20.45802
## 5 30.93538 15.92587
## 6 30.54639 14.94908
## right_eyebrow_outer_end_y nose_tip_x nose_tip_y mouth_left_corner_x
## 1 29.64747 44.42057 57.06680 61.19531
## 2 27.05886 48.20630 55.66094 56.42145
## 3 32.08712 47.55726 53.53895 60.82295
## 4 29.90934 51.88508 54.16654 65.59889
## 5 30.67218 43.29953 64.88952 60.67141
## 6 32.15013 52.46849 58.80000 64.86908
## mouth_left_corner_y mouth_right_corner_x mouth_right_corner_y
## 1 79.97017 28.61450 77.38899
## 2 76.35200 35.12238 76.04766
## 3 73.01432 33.72632 72.73200
## 4 72.70372 37.24550 74.19548
## 5 77.52324 31.19175 76.99730
## 6 82.47118 31.99043 81.66908
## mouth_center_top_lip_x mouth_center_top_lip_y mouth_center_bottom_lip_x
## 1 43.31260 72.93546 43.13071
## 2 46.68460 70.26655 45.46791
## 3 47.27495 70.19179 47.27495
## 4 50.30317 70.09169 51.56118
## 5 44.96275 73.70739 44.22714
## 6 49.30811 78.48763 49.43237
## mouth_center_bottom_lip_y
## 1 84.48577
## 2 85.48017
## 3 78.65937
## 4 78.26838
## 5 86.87117
## 6 93.89877
## Image
## 1 238 236 237 238 240 240 239 241 241 243 240 239 231 212 190 173 148 122 104 92 79 73 74 73 73 74 81
## 2 219 215 204 196 204 211 212 200 180 168 178 196 194 196 203 209 199 192 197 201 207 215 199 190 182
## 3 144 142 159 180 188 188 184 180 167 132 84 59 54 57 62 61 55 54 56 50 60 78 85 86 88 89 90 90 88 89
## 4 193 192 193 194 194 194 193 192 168 111 50 12 1 1 1 1 1 1 1 1 1 1 6 16 19 17 13 13 16 22 25 31 34 27
## 5 147 148 160 196 215 214 216 217 219 220 206 188 166 104 88 81 77 71 63 58 58 52 58 62 59 60 55 51 57
## 6 167 169 170 167 156 145 106 68 52 24 20 15 21 14 6 9 11 11 29 49 61 71 76 80 82 84 84 84 83 88 91 92
## left_eye_center_x left_eye_center_y right_eye_center_x
## 244 63.76497 38.17976 24.46709
## 1074 67.61736 35.73515 32.32068
## 3590 68.68139 35.63807 30.43863
## 5129 65.56684 37.63803 33.38537
## 5183 68.42459 43.83568 35.01755
## 6975 47.85052 37.39213 26.10544
## right_eye_center_y left_eye_inner_corner_x left_eye_inner_corner_y
## 244 40.70400 57.27570 38.90097
## 1074 36.64797 61.53191 36.64797
## 3590 38.30054 NA NA
## 5129 38.01223 NA NA
## 5183 39.69364 NA NA
## 6975 40.37090 NA NA
## left_eye_outer_corner_x left_eye_outer_corner_y
## 244 71.69667 38.90097
## 1074 73.70281 35.43088
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## right_eye_inner_corner_x right_eye_inner_corner_y
## 244 31.31697 39.98279
## 1074 38.71047 37.25651
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## right_eye_outer_corner_x right_eye_outer_corner_y
## 244 17.25661 41.78501
## 1074 25.62655 36.95224
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## left_eyebrow_inner_end_x left_eyebrow_inner_end_y
## 244 54.39086 29.52766
## 1074 59.73991 31.37784
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## left_eyebrow_outer_end_x left_eyebrow_outer_end_y
## 244 76.74434 27.36403
## 1074 77.04987 28.43248
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## right_eyebrow_inner_end_x right_eyebrow_inner_end_y
## 244 33.84121 31.69049
## 1074 43.01889 32.12017
## 3590 NA NA
## 5129 NA NA
## 5183 NA NA
## 6975 NA NA
## right_eyebrow_outer_end_x right_eyebrow_outer_end_y nose_tip_x
## 244 11.84834 33.49351 43.93573
## 1074 21.97583 32.08381 51.18638
## 3590 NA NA 51.01229
## 5129 NA NA 49.47608
## 5183 NA NA 51.17158
## 6975 NA NA 33.25451
## nose_tip_y mouth_left_corner_x mouth_left_corner_y
## 244 52.96215 60.52034 76.75644
## 1074 57.33855 66.40000 72.24851
## 3590 59.60032 NA NA
## 5129 63.08383 NA NA
## 5183 64.96020 NA NA
## 6975 59.13721 NA NA
## mouth_right_corner_x mouth_right_corner_y mouth_center_top_lip_x
## 244 35.64343 78.55946 46.81976
## 1074 36.27643 72.55285 51.49072
## 3590 NA NA NA
## 5129 NA NA NA
## 5183 NA NA NA
## 6975 NA NA NA
## mouth_center_top_lip_y mouth_center_bottom_lip_x
## 244 70.62776 47.90158
## 1074 72.24851 51.49072
## 3590 NA 52.46453
## 5129 NA 49.32317
## 5183 NA 49.51476
## 6975 NA 38.31844
## mouth_center_bottom_lip_y
## 244 83.96773
## 1074 80.76800
## 3590 67.58770
## 5129 72.81309
## 5183 75.72958
## 6975 76.71200
## Image
## 244 41 36 34 33 41 47 43 38 37 39 40 35 27 23 27 31 32 28 26 29 35 38 37 39 42 41 40 42 41 39 44 50 51 4
## 1074 202 201 202 202 201 201 201 201 184 96 36 30 30 35 41 54 95 158 191 194 194 195 195 193 191 188 189
## 3590 219 219 217 220 228 225 223 223 224 224 226 226 227 223 218 220 225 224 220 207 206 208 203 211 220
## 5129 194 196 197 198 197 194 192 188 189 196 108 53 69 51 48 35 34 19 33 45 31 17 25 15 12 19 23 27 29 29
## 5183 140 137 127 118 111 104 105 111 115 116 117 111 104 99 93 90 93 96 93 90 91 95 101 114 126 137 150 1
## 6975 31 28 27 31 38 46 62 70 82 90 93 92 88 85 83 75 71 65 57 50 41 33 28 24 23 25 28 32 35 38 38 36 32 2
## left_eye_center_x left_eye_center_y right_eye_center_x
## 7044 66.86722 37.35686 30.75093
## 7045 67.40255 31.84255 29.74675
## 7046 66.13440 38.36550 30.47863
## 7047 66.69073 36.84522 31.66642
## 7048 70.96508 39.85367 30.54328
## 7049 66.93831 43.42451 31.09606
## right_eye_center_y left_eye_inner_corner_x left_eye_inner_corner_y
## 7044 40.11574 NA NA
## 7045 38.63294 NA NA
## 7046 39.95020 NA NA
## 7047 39.68504 NA NA
## 7048 40.77234 NA NA
## 7049 39.52860 NA NA
## left_eye_outer_corner_x left_eye_outer_corner_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## right_eye_inner_corner_x right_eye_inner_corner_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## right_eye_outer_corner_x right_eye_outer_corner_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## left_eyebrow_inner_end_x left_eyebrow_inner_end_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## left_eyebrow_outer_end_x left_eyebrow_outer_end_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## right_eyebrow_inner_end_x right_eyebrow_inner_end_y
## 7044 NA NA
## 7045 NA NA
## 7046 NA NA
## 7047 NA NA
## 7048 NA NA
## 7049 NA NA
## right_eyebrow_outer_end_x right_eyebrow_outer_end_y nose_tip_x
## 7044 NA NA 43.54211
## 7045 NA NA 48.26596
## 7046 NA NA 47.91035
## 7047 NA NA 49.46257
## 7048 NA NA 50.75420
## 7049 NA NA 47.06925
## nose_tip_y mouth_left_corner_x mouth_left_corner_y
## 7044 64.94569 NA NA
## 7045 67.02909 NA NA
## 7046 66.62601 NA NA
## 7047 67.51516 NA NA
## 7048 66.72499 NA NA
## 7049 73.03334 NA NA
## mouth_right_corner_x mouth_right_corner_y mouth_center_top_lip_x
## 7044 NA NA NA
## 7045 NA NA NA
## 7046 NA NA NA
## 7047 NA NA NA
## 7048 NA NA NA
## 7049 NA NA NA
## mouth_center_top_lip_y mouth_center_bottom_lip_x
## 7044 NA 47.55504
## 7045 NA 50.42664
## 7046 NA 50.28740
## 7047 NA 49.46257
## 7048 NA 50.06519
## 7049 NA 45.90048
## mouth_center_bottom_lip_y
## 7044 79.49255
## 7045 79.68392
## 7046 77.98302
## 7047 78.11712
## 7048 79.58645
## 7049 82.77310
## Image
## 7044 150 150 132 63 44 74 86 61 62 57 44 70 93 115 114 115 99 110 94 108 108 94 97 86 79 75 101 90 93 89
## 7045 71 74 85 105 116 128 139 150 170 187 201 209 218 219 212 198 184 181 185 188 193 196 199 202 206 208
## 7046 60 60 62 57 55 51 49 48 50 53 56 56 106 89 77 98 100 107 106 90 90 94 88 94 103 118 123 126 123 144
## 7047 74 74 74 78 79 79 79 81 77 78 80 73 72 81 77 120 184 191 193 172 194 203 203 202 198 199 207 214 214
## 7048 254 254 254 254 254 238 193 145 121 118 119 109 106 106 105 107 109 111 113 117 126 129 129 129 129
## 7049 53 62 67 76 86 91 97 105 105 106 107 108 112 117 123 129 130 128 132 134 136 142 149 155 157 157 153
## 'data.frame': 7049 obs. of 20 variables:
## $ left_eye_center_x : num 66 64.3 65.1 65.2 66.7 ...
## $ left_eye_center_y : num 39 35 34.9 37.3 39.6 ...
## $ right_eye_center_x : num 30.2 29.9 30.9 32 32.2 ...
## $ right_eye_center_y : num 36.4 33.4 34.9 37.3 38 ...
## $ left_eye_inner_corner_x : num 59.6 58.9 59.4 60 58.6 ...
## $ left_eye_inner_corner_y : num 39.6 35.3 36.3 39.1 39.6 ...
## $ left_eye_outer_corner_x : num 73.1 70.7 71 72.3 72.5 ...
## $ left_eye_outer_corner_y : num 40 36.2 36.3 38.4 39.9 ...
## $ right_eye_inner_corner_x : num 36.4 36 37.7 37.6 37 ...
## $ right_eye_inner_corner_y : num 37.4 34.4 36.3 38.8 39.1 ...
## $ right_eye_outer_corner_x : num 23.5 24.5 25 25.3 22.5 ...
## $ right_eye_outer_corner_y : num 37.4 33.1 36.6 38 38.3 ...
## $ left_eyebrow_inner_end_x : num 57 54 55.7 56.4 57.2 ...
## $ left_eyebrow_inner_end_y : num 29 28.3 27.6 30.9 30.7 ...
## $ left_eyebrow_outer_end_x : num 80.2 78.6 78.9 77.9 77.8 ...
## $ left_eyebrow_outer_end_y : num 32.2 30.4 32.7 31.7 31.7 ...
## $ right_eyebrow_inner_end_x: num 40.2 42.7 42.2 41.7 38 ...
## $ right_eyebrow_inner_end_y: num 29 26.1 28.1 31 30.9 ...
## $ right_eyebrow_outer_end_x: num 16.4 16.9 16.8 20.5 15.9 ...
## $ right_eyebrow_outer_end_y: num 29.6 27.1 32.1 29.9 30.7 ...
## NULL
## 'data.frame': 7049 obs. of 21 variables:
## $ right_eye_outer_corner_x : num 23.5 24.5 25 25.3 22.5 ...
## $ right_eye_outer_corner_y : num 37.4 33.1 36.6 38 38.3 ...
## $ left_eyebrow_inner_end_x : num 57 54 55.7 56.4 57.2 ...
## $ left_eyebrow_inner_end_y : num 29 28.3 27.6 30.9 30.7 ...
## $ left_eyebrow_outer_end_x : num 80.2 78.6 78.9 77.9 77.8 ...
## $ left_eyebrow_outer_end_y : num 32.2 30.4 32.7 31.7 31.7 ...
## $ right_eyebrow_inner_end_x: num 40.2 42.7 42.2 41.7 38 ...
## $ right_eyebrow_inner_end_y: num 29 26.1 28.1 31 30.9 ...
## $ right_eyebrow_outer_end_x: num 16.4 16.9 16.8 20.5 15.9 ...
## $ right_eyebrow_outer_end_y: num 29.6 27.1 32.1 29.9 30.7 ...
## $ nose_tip_x : num 44.4 48.2 47.6 51.9 43.3 ...
## $ nose_tip_y : num 57.1 55.7 53.5 54.2 64.9 ...
## $ mouth_left_corner_x : num 61.2 56.4 60.8 65.6 60.7 ...
## $ mouth_left_corner_y : num 80 76.4 73 72.7 77.5 ...
## $ mouth_right_corner_x : num 28.6 35.1 33.7 37.2 31.2 ...
## $ mouth_right_corner_y : num 77.4 76 72.7 74.2 77 ...
## $ mouth_center_top_lip_x : num 43.3 46.7 47.3 50.3 45 ...
## $ mouth_center_top_lip_y : num 72.9 70.3 70.2 70.1 73.7 ...
## $ mouth_center_bottom_lip_x: num 43.1 45.5 47.3 51.6 44.2 ...
## $ mouth_center_bottom_lip_y: num 84.5 85.5 78.7 78.3 86.9 ...
## $ Image : chr "238 236 237 238 240 240 239 241 241 243 240 239 231 212 190 173 148 122 104 92 79 73 74 73 73 74 81 74 60 64 75 86 93 102 100 1"| __truncated__ "219 215 204 196 204 211 212 200 180 168 178 196 194 196 203 209 199 192 197 201 207 215 199 190 182 180 183 190 190 176 175 175"| __truncated__ "144 142 159 180 188 188 184 180 167 132 84 59 54 57 62 61 55 54 56 50 60 78 85 86 88 89 90 90 88 89 91 94 95 98 99 101 104 107 "| __truncated__ "193 192 193 194 194 194 193 192 168 111 50 12 1 1 1 1 1 1 1 1 1 1 6 16 19 17 13 13 16 22 25 31 34 27 15 19 16 19 17 13 9 6 3 1 "| __truncated__ ...
## NULL
## Warning in myprint_str_df(df): [list output truncated]
## [1] "Reading file ./data/test/test.csv..."
## [1] "dimensions of data in ./data/test/test.csv: 1,783 rows x 2 cols"
## [1] " Truncating Image to first 100 chars..."
## ImageId
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## Image
## 1 182 183 182 182 180 180 176 169 156 137 124 103 79 62 54 56 58 48 49 45 39 37 42 43 52 61 78 93 104
## 2 76 87 81 72 65 59 64 76 69 42 31 38 49 58 58 47 37 33 32 33 35 50 55 54 50 51 61 78 92 100 101 79 55
## 3 177 176 174 170 169 169 168 166 166 166 161 140 69 5 1 2 1 18 61 96 110 122 129 129 127 125 125 119
## 4 176 174 174 175 174 174 176 176 175 171 165 157 143 134 134 137 138 137 135 135 134 137 135 128 128
## 5 50 47 44 101 144 149 120 58 48 42 35 35 37 39 38 36 34 31 31 32 32 34 34 34 35 33 32 30 31 33 33 31
## 6 177 177 177 171 142 115 97 84 89 90 88 82 63 51 40 35 39 37 42 38 29 35 43 64 95 117 127 115 108 125
## ImageId
## 3 3
## 319 319
## 691 691
## 698 698
## 717 717
## 824 824
## Image
## 3 177 176 174 170 169 169 168 166 166 166 161 140 69 5 1 2 1 18 61 96 110 122 129 129 127 125 125 119
## 319 33 34 38 39 37 32 29 26 24 24 24 23 26 46 65 68 73 77 90 99 100 107 111 113 117 121 128 138 148 154
## 691 34 32 34 43 38 23 8 15 18 19 19 39 47 45 30 43 51 50 44 40 37 36 37 37 37 39 41 43 48 50 53 57 59 62
## 698 14 14 15 16 18 21 23 25 27 29 30 31 31 33 34 36 39 45 60 73 81 89 97 108 115 121 126 128 129 127 124
## 717 17 14 21 20 17 40 77 93 103 121 150 165 153 144 144 118 90 112 132 155 167 170 172 176 181 179 182 1
## 824 86 110 151 194 223 197 177 158 149 144 181 207 216 206 185 163 142 128 117 109 83 53 54 57 63 71 80
## ImageId
## 1778 1778
## 1779 1779
## 1780 1780
## 1781 1781
## 1782 1782
## 1783 1783
## Image
## 1778 100 106 105 106 105 104 104 108 112 114 111 108 108 111 113 111 108 117 130 114 114 135 108 87 91 82
## 1779 101 101 101 100 100 97 97 98 102 149 214 206 171 159 159 162 170 178 171 171 171 171 170 164 163 175
## 1780 201 191 171 158 145 140 136 130 123 115 108 104 100 96 99 115 132 155 167 174 170 160 159 158 166 17
## 1781 28 28 29 30 31 32 33 34 39 44 46 46 49 54 61 73 84 97 110 119 128 133 137 138 139 140 144 146 147 14
## 1782 104 95 71 57 46 52 65 70 70 67 76 72 69 69 72 75 73 68 81 67 58 35 33 41 27 20 13 28 39 53 70 75 80
## 1783 63 61 64 66 66 64 65 70 69 70 77 83 63 34 22 21 21 18 23 12 17 22 24 37 32 15 15 20 20 15 9 9 9 8 9
## 'data.frame': 1783 obs. of 2 variables:
## $ ImageId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Image : chr "182 183 182 182 180 180 176 169 156 137 124 103 79 62 54 56 58 48 49 45 39 37 42 43 52 61 78 93 104 107 114 115 117 122 120 122"| __truncated__ "76 87 81 72 65 59 64 76 69 42 31 38 49 58 58 47 37 33 32 33 35 50 55 54 50 51 61 78 92 100 101 79 55 47 52 50 47 39 38 52 46 25"| __truncated__ "177 176 174 170 169 169 168 166 166 166 161 140 69 5 1 2 1 18 61 96 110 122 129 129 127 125 125 119 112 110 111 107 102 102 99 "| __truncated__ "176 174 174 175 174 174 176 176 175 171 165 157 143 134 134 137 138 137 135 135 134 137 135 128 128 129 122 110 107 112 115 123"| __truncated__ ...
## - attr(*, "comment")= chr "glbObsNew"
## NULL
## [1] "Creating new feature: .pos..."
## [1] "Creating new feature: .pos.y..."
## [1] "Creating new feature: ImageId..."
## [1] "Creating new feature: left_eye_center_x..."
## [1] "Creating new feature: left_eye_center_y..."
## [1] "Creating new feature: Image.pxl.1.dgt.1..."
## [1] "Partition stats:"
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## .pos.y.cut.fctr .src .n
## 1 (-7.83,2.94e+03] Train 2944
## 2 (2.94e+03,5.89e+03] Train 2944
## 3 (5.89e+03,8.84e+03] Test 1783
## 4 (5.89e+03,8.84e+03] Train 1161
## .pos.y.cut.fctr .src .n
## 1 (-7.83,2.94e+03] Train 2944
## 2 (2.94e+03,5.89e+03] Train 2944
## 3 (5.89e+03,8.84e+03] Test 1783
## 4 (5.89e+03,8.84e+03] Train 1161
## .src .n
## 1 Train 7049
## 2 Test 1783
## Loading required package: lazyeval
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## [1] "Found 0 duplicates by all features:"
## NULL
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 11.547 78.872 67.325
## 2 inspect.data 2 0 0 78.873 NA NA
2.0: inspect data## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "numeric data missing in glbObsAll: "
## left_eye_center_x left_eye_center_y
## 1793 1793
## right_eye_center_x right_eye_center_y
## 1796 1796
## left_eye_inner_corner_x left_eye_inner_corner_y
## 6561 6561
## left_eye_outer_corner_x left_eye_outer_corner_y
## 6565 6565
## right_eye_inner_corner_x right_eye_inner_corner_y
## 6564 6564
## right_eye_outer_corner_x right_eye_outer_corner_y
## 6564 6564
## left_eyebrow_inner_end_x left_eyebrow_inner_end_y
## 6562 6562
## left_eyebrow_outer_end_x left_eyebrow_outer_end_y
## 6607 6607
## right_eyebrow_inner_end_x right_eyebrow_inner_end_y
## 6562 6562
## right_eyebrow_outer_end_x right_eyebrow_outer_end_y
## 6596 6596
## nose_tip_x nose_tip_y
## 1783 1783
## mouth_left_corner_x mouth_left_corner_y
## 6563 6563
## mouth_right_corner_x mouth_right_corner_y
## 6562 6562
## mouth_center_top_lip_x mouth_center_top_lip_y
## 6557 6557
## mouth_center_bottom_lip_x mouth_center_bottom_lip_y
## 1816 1816
## [1] "numeric data w/ 0s in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## Image ImageId Image.pxl.1.dgt.1
## 0 0 0
## label step_major step_minor label_minor bgn end elapsed
## 2 inspect.data 2 0 0 78.873 84.806 5.933
## 3 scrub.data 2 1 1 84.806 NA NA
2.1: scrub data## [1] "numeric data missing in glbObsAll: "
## left_eye_center_x left_eye_center_y
## 1793 1793
## right_eye_center_x right_eye_center_y
## 1796 1796
## left_eye_inner_corner_x left_eye_inner_corner_y
## 6561 6561
## left_eye_outer_corner_x left_eye_outer_corner_y
## 6565 6565
## right_eye_inner_corner_x right_eye_inner_corner_y
## 6564 6564
## right_eye_outer_corner_x right_eye_outer_corner_y
## 6564 6564
## left_eyebrow_inner_end_x left_eyebrow_inner_end_y
## 6562 6562
## left_eyebrow_outer_end_x left_eyebrow_outer_end_y
## 6607 6607
## right_eyebrow_inner_end_x right_eyebrow_inner_end_y
## 6562 6562
## right_eyebrow_outer_end_x right_eyebrow_outer_end_y
## 6596 6596
## nose_tip_x nose_tip_y
## 1783 1783
## mouth_left_corner_x mouth_left_corner_y
## 6563 6563
## mouth_right_corner_x mouth_right_corner_y
## 6562 6562
## mouth_center_top_lip_x mouth_center_top_lip_y
## 6557 6557
## mouth_center_bottom_lip_x mouth_center_bottom_lip_y
## 1816 1816
## [1] "numeric data w/ 0s in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ Infs in glbObsAll: "
## named integer(0)
## [1] "numeric data w/ NaNs in glbObsAll: "
## named integer(0)
## [1] "string data missing in glbObsAll: "
## Image ImageId Image.pxl.1.dgt.1
## 0 0 0
## label step_major step_minor label_minor bgn end elapsed
## 3 scrub.data 2 1 1 84.806 86.384 1.578
## 4 transform.data 2 2 2 86.384 NA NA
2.2: transform data## label step_major step_minor label_minor bgn end elapsed
## 4 transform.data 2 2 2 86.384 86.426 0.042
## 5 extract.features 3 0 0 86.427 NA NA
3.0: extract features## label step_major step_minor label_minor bgn
## 5 extract.features 3 0 0 86.427
## 6 extract.features.datetime 3 1 1 86.448
## end elapsed
## 5 86.448 0.021
## 6 NA NA
3.1: extract features datetime## label step_major step_minor label_minor bgn
## 1 extract.features.datetime.bgn 1 0 0 86.475
## end elapsed
## 1 NA NA
## label step_major step_minor label_minor bgn
## 6 extract.features.datetime 3 1 1 86.448
## 7 extract.features.image 3 2 2 86.484
## end elapsed
## 6 86.484 0.036
## 7 NA NA
3.2: extract features imageextract.features.image.chunk.df <- myadd_chunk(NULL, "extract.features.image.bgn")
## label step_major step_minor label_minor bgn end
## 1 extract.features.image.bgn 1 0 0 86.515 NA
## elapsed
## 1 NA
if (length(names(glbFeatsImage)) > 0) {
for (feat in names(glbFeatsImage)) {
extract.features.image.chunk.df <-
myadd_chunk(extract.features.image.chunk.df, paste0("extract.features.image", ".", feat, ".bgn"),
major.inc = TRUE)
glbFeatsImage[[feat]]$imgMtrx <-
foreach(obsIx = 1:nrow(glbObsAll), .combine = rbind) %dopar% {
# foreach(obsIx = 1:5, .combine = rbind) %dopar% {
as.integer(unlist(strsplit(glbObsAll[obsIx, feat], " ")))
}
mygetImage <- function(obsIx, featImage, plot = FALSE,
ptFeat = NULL, ptQry = NULL, patch = NULL) {
img <- matrix(data = glbFeatsImage[[featImage]]$imgMtrx[obsIx, ],
nrow = 96, ncol = 96)
if (plot) {
require(reshape2)
imgRev <- matrix(data = rev(glbFeatsImage[[featImage]]$imgMtrx[obsIx, ]),
nrow = 96, ncol = 96)
mltd <- melt(imgRev); names(mltd) <- c("x", "y", "value")
# print(img[1:6,1])
# print(head(mltd))
gp <- ggplot(mltd, aes(x = x, y = y)) + geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "#000000", high = "#FFFFFF")
if (!is.null(ptFeat)) {
gp <- gp + geom_point(data = data.frame(
x = 96 - ptFeat["x"], y = 96 - ptFeat["y"]),
mapping = aes(x = x, y = y), color = "blue", shape = 10, size = 5)
}
if (!is.null(ptQry)) {
if (!is.data.frame(ptQry)) ptQry <- as.data.frame(ptQry)
gp <- gp + geom_point(data = ptQry,
mapping = aes(x = 96 - x, y = 96 - y), color = "red", shape = 13, size = 5)
# gp <- gp + geom_point(data = data.frame(
# x = 96 - ptQry["x"], y = 96 - ptQry["y"]),
# mapping = aes(x = x, y = y), color = "red", shape = 13, size = 5)
}
if (!is.null(patch)) {
gp <- gp + geom_rect(data = data.frame(
xmin = 96 - patch["xmin"], xmax = 96 - patch["xmax"],
ymin = 96 - patch["ymin"], ymax = 96 - patch["ymax"]),
mapping = aes(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax),
color = "green", inherit.aes = FALSE, fill = NA)
}
print(gp)
}
return(img)
}
#obsIx=1; jnk <- mygetImage(obsIx, feat, plot = TRUE, ptFeat = c(x = glbObsAll[obsIx, "left_eye_center_x"], y = glbObsAll[obsIx, "left_eye_center_y"]), ptQry = c(x = 6, y = 6), patch = c(xmin = 4, xmax = 8, ymin = 4, ymax = 8))
extract.features.image.chunk.df <- myadd_chunk(extract.features.image.chunk.df,
paste0("extract.features.image", ".", feat, ".display"),
major.inc = TRUE)
for (src in unique(glbObsAll$.src)) {
print(sprintf(" Sample images from %s:%s", src, feat))
# glbObsAll is sorted by .src rather than .pos
# obsIx=sample(glbObsAll[glbObsAll$.src == "Train", ".pos"], 1)
set.seed(lclImageSampleSeed)
smpl <- 1:nrow(glbObsAll)
smpl <- sample(smpl[glbObsAll$.src == src], 5)
for (obsIx in smpl) {
print(sprintf(" obsIx:%d:", obsIx))
print(glbObsAll[obsIx, glb_dsp_cols])
ptFeat <- c(x = glbObsAll[obsIx, "left_eye_center_x"],
y = glbObsAll[obsIx, "left_eye_center_y"])
patch <- c(xmin = glbObsAll[obsIx, "left_eye_center_x"] - 2,
xmax = glbObsAll[obsIx, "left_eye_center_x"] + 2,
ymin = glbObsAll[obsIx, "left_eye_center_y"] - 2,
ymax = glbObsAll[obsIx, "left_eye_center_y"] + 2)
jnk <- mygetImage(obsIx, feat, plot = TRUE, ptFeat = ptFeat, patch = patch)
# jnk <- mygetImage(obsIx, feat, plot = TRUE, featHighlight = c("left_eye_center_x", "left_eye_center_y"), patchHighlight = c(xmin=66-2, xmax=66+2, ymin=38-2, ymax=38+2))
}
}
# Data fixes
glbObsFix <- c("Train#1878","Train#1908")
for (id in glbObsFix) {
print(sprintf("Fixing data for image: %s: obsIx:%d", id,
obsIx <- which(glbObsAll[, glbFeatsId] == id)))
jnk <- mygetImage(obsIx, feat, plot = TRUE,
ptFeat = c(x = glbObsAll[obsIx, "left_eye_center_x"],
y = glbObsAll[obsIx, "left_eye_center_y"]))
glbObsAll[obsIx, c("left_eye_center_x", "left_eye_center_y")] <- NA
}
if (!is.null(glbFeatsImage[[feat]]$patchSize)) {
extract.features.image.chunk.df <- myadd_chunk(extract.features.image.chunk.df,
paste0("extract.features.image", ".", feat, ".patch.mean"),
major.inc = TRUE)
#crdX <- paste(glb_rsp_var, "x", sep = "_") glb_rsp_var == "left_eye_center_x"
crdX <- paste("left_eye_center", "x", sep = "_")
crdY <- paste("left_eye_center", "y", sep = "_")
patchSize <- glbFeatsImage[[feat]]$patchSize
patches <- foreach(obsIx = c(1:nrow(glbObsAll))[glbObsAll$.src == "Train"],
# patches <- foreach(obsIx = 1:7039,
.combine = rbind) %dopar% {
img <- mygetImage(obsIx, feat)
# img <- matrix(data = glbFeatsImage[[feat]]$imgMtrx[obsIx, ], nrow = 96, ncol = 96)
ptFeat <- c(x = glbObsAll[obsIx, crdX], y = glbObsAll[obsIx, crdY])
patch <- c(xmin = glbObsAll[obsIx, crdX] - patchSize,
xmax = glbObsAll[obsIx, crdX] + patchSize,
ymin = glbObsAll[obsIx, crdY] - patchSize,
ymax = glbObsAll[obsIx, crdY] + patchSize)
if ((!is.na(ptFeat["x"])) && (!is.na(ptFeat["y"])) &&
(patch["xmin"] >= 1) && (patch["xmax"] <= 96) &&
(patch["ymin"] >= 1) && (patch["ymax"] <= 96)) {
result <- as.vector(img[patch["xmin"]:patch["xmax"], patch["ymin"]:patch["ymax"]])
} else {
result <- NULL
}
result <- result
}
patchEdgeLength = 2 * glbFeatsImage[[feat]]$patchSize + 1
glbFeatsImage[[feat]]$meanPatchMtrx <-
matrix(data = colMeans(patches), nrow = patchEdgeLength, ncol = patchEdgeLength)
print(sprintf(" Mean patch (size = %d) for %s:%s",
glbFeatsImage[[feat]]$patchSize, feat, "left_eye_center"))
image(1:patchEdgeLength, 1:patchEdgeLength,
glbFeatsImage[[feat]]$meanPatchMtrx[patchEdgeLength:1, patchEdgeLength:1],
col = gray((0:255) / 255))
rm(patches)
extract.features.image.chunk.df <- myadd_chunk(extract.features.image.chunk.df,
paste0("extract.features.image", ".", feat, ".patch.search"),
major.inc = TRUE)
searchSize <- 2
# Compute, Inspect & Output TrnPatches
mygetImageSearchSpace <- function(searchCenter, searchSize) {
x1 <- as.integer(searchCenter["x"]) - searchSize
x2 <- as.integer(searchCenter["x"]) + searchSize
y1 <- as.integer(searchCenter["y"]) - searchSize
y2 <- as.integer(searchCenter["y"]) + searchSize
return(searchSpace <- expand.grid(x = x1:x2, y = y1:y2))
}
#print(mygetImageSearchSpace(searchCenter = c(x = glbObsAll[obsIx, "left_eye_center_x"], y = glbObsAll[obsIx, "left_eye_center_y"]), searchSize = 2))
meanXY <- c(x = mean(glbObsAll[glbObsAll$.src == "Train", crdX], na.rm = TRUE),
y = mean(glbObsAll[glbObsAll$.src == "Train", crdY], na.rm = TRUE))
require(proxy)
tmpObsAll <- glbObsAll[, c(glbFeatsId, "left_eye_center_x", "left_eye_center_y")]
# dplyr::mutate(left_eye_center_x.int = as.integer(left_eye_center_x),
# left_eye_center_y.int = as.integer(left_eye_center_y)) %>%
# dplyr::select(ImageId, left_eye_center_x.int, left_eye_center_y.int)
startTm <- proc.time()["elapsed"]
outObs <-
foreach(obsIx = c(1:nrow(glbObsAll)),
# foreach(obsIx = sample(c(1:nrow(glbObsAll))[glbObsAll$.src == "Train"], 5),
# foreach(obsIx = sample(c(1:nrow(glbObsAll))[glbObsAll$.src == "Test" ], 5),
# obsIx=sample(glbObsAll[glbObsAll$.src == "Train", ".pos"], 1)
.combine = rbind) %dopar% {
img <- mygetImage(obsIx, feat)
patchSize <- glbFeatsImage[[feat]]$patchSize
lblXY <- c(x = as.integer(glbObsAll[obsIx, "left_eye_center_x"]),
y = as.integer(glbObsAll[obsIx, "left_eye_center_y"]))
#jnk <- mygetImage(obsIx, feat, plot = TRUE, ptFeat = c(x = glbObsAll[obsIx, "left_eye_center_x"], y = glbObsAll[obsIx, "left_eye_center_y"]))
if (is.na(lblXY["x"]) || is.na(lblXY["y"]))
searchSpace <- mygetImageSearchSpace(searchCenter = meanXY,
searchSize = searchSize) else
searchSpace <- mygetImageSearchSpace(searchCenter = lblXY,
searchSize = searchSize)
thsObs <- foreach(j = 1:nrow(searchSpace), .combine = rbind) %dopar% {
x <- searchSpace$x[j]
y <- searchSpace$y[j]
p <- img[max( 1, (x - patchSize)):min(96, (x + patchSize)),
max( 1, (y - patchSize)):min(96, (y + patchSize))]
if (length(as.vector(p)) !=
length(as.vector(glbFeatsImage[[feat]]$meanPatchMtrx)))
thsPatch <- NULL else {
P.cor <- cor(as.vector(p),
as.vector(glbFeatsImage[[feat]]$meanPatchMtrx))
P.mnkSml.1 <- as.vector(proxy::simil(
rbind(as.vector(p),
as.vector(glbFeatsImage[[feat]]$meanPatchMtrx)),
method = "Minkowski", p = 1))
if (is.na(lblXY["x"]) || is.na(lblXY["y"]))
label <- ".none" else
if ((x != lblXY["x"]) || (y != lblXY["y"]))
label <- ".none" else
label <- "left_eye_center"
thsPatch <- data.frame(x, y, P.cor, P.mnkSml.1, label)
# Ensure all metrics are similarity oriented (vs. distance)
}
thsPatch
}
if (is.null(thsObs)) thsObsOut <- NULL else
thsObsOut <- cbind(tmpObsAll[obsIx, ], thsObs)
#bstIx <- thsObsOut[which.max(thsObsOut$patch.cor), ]; jnk <- mygetImage(obsIx, feat, plot = TRUE, ptFeat = c(x = glbObsAll[obsIx, "left_eye_center_x"], y = glbObsAll[obsIx, "left_eye_center_y"]), ptQry = c(x = bstIx$x, y = bstIx$y))
}
print(sprintf("Elapsed time: %f secs", proc.time()["elapsed"] - startTm))
outObsTrn <-
outObs[outObs[, glbFeatsId] %in% glbObsAll[glbObsAll$.src == "Train", glbFeatsId], ]
outObsNew <-
outObs[outObs[, glbFeatsId] %in% glbObsAll[glbObsAll$.src == "Test" , glbFeatsId], ]
# stop("here"); save.image(file = paste0(glbOut$pfx, "tmp.RData"))
metrics <- grep("^P\\.", names(outObsTrn), value = TRUE)
print(sprintf("outObsTrn Distribution:"))
smmry <- lapply(metrics, function(metric) {
tapply(outObsTrn[, metric], outObsTrn[, "label"], summary) })
names(smmry) <- metrics; print(smmry)
print(myplot_violin(outObsTrn, metrics, xcol_name = "label") +
#facet_wrap(~object, scales = "free_y") +
ggtitle(sprintf("%s:%s:outObsTrn", feat, "left_eye_center")))
# print(myplot_histogram(outObsTrn, metric) + facet_wrap(~label, scales = "free_y"))
# ggtitle(sprintf("%s:%s:outObsTrn\n%s", feat, "left_eye_center",
# paste(names(summary(outObsTrn[, metric])),
# summary(outObsTrn[outObsTrn$label == "left_eye_center", metric]),
# sep = ":", collapse = "; "))))
require(lazyeval)
# Plot images with is.na(glbObsTrn$ptFeat) in outObsTrn
tmpObsTrn <-
glbObsAll[glbObsAll$.src == "Train",
c(glbFeatsId, "left_eye_center_x", "left_eye_center_y")] %>%
dplyr::filter(is.na(left_eye_center_x))
selObsTrn <-
merge(tmpObsTrn[, glbFeatsId, FALSE], outObsTrn,
by.x = c(glbFeatsId),
by.y = c(glbFeatsId),
all.x = TRUE) %>%
dplyr::group_by(ImageId)
bstObsTrn <- foreach(metric = metrics, .combine = rbind) %do%
dplyr::filter_(selObsTrn, interp(~(min_rank(desc(var)) <= 1), var = as.name(metric)))
if (unique(selObsTrn$label) %in% c("left_eye_center"))
stop("this should not happen")
print(sprintf("Sample Images (%d of %d) of is.na(%s.left_eye_center)",
min(10, nrow(tmpObsTrn)), nrow(tmpObsTrn), feat))
for (id in sort(sample(tmpObsTrn[, glbFeatsId], 10))) {
thsObsTrn <- bstObsTrn[bstObsTrn[, glbFeatsId] == id, ]
print(thsObsTrn)
jnk <- mygetImage(obsIx = which(glbObsAll[, glbFeatsId] == id), featImage = feat,
plot = TRUE,
ptQry = data.frame(x = thsObsTrn$x, y = thsObsTrn$y))
}
for (metric in metrics) {
selObsTrn <- subset(outObsTrn,
(x == left_eye_center_x) & (y == left_eye_center_y)) %>%
dplyr::arrange_(metric)
# Plot images with least & most 10 metric for detection point in outObsTrn
for (type in c("min", "max")) {
print(sprintf("Sample Images of %s(%s.left_eye_center.%s)", type, feat, metric))
for (id in switch(type,
"min" = head(selObsTrn, 10)[, glbFeatsId],
"max" = tail(selObsTrn, 10)[, glbFeatsId])) {
thsObsTrn <- selObsTrn[selObsTrn[, glbFeatsId] == id, ]
print(thsObsTrn)
bstObsTrn <- foreach(metric2 = metrics, .combine = rbind) %do% {
outObsTrn[outObsTrn[, glbFeatsId] == id, ] %>%
dplyr::filter_(interp(~(min_rank(desc(var)) <= 1), var = as.name(metric2)))
}
print(bstObsTrn)
jnk <- mygetImage(obsIx = which(glbObsAll[, glbFeatsId] == id), featImage = feat,
plot = TRUE,
ptFeat = c(x = thsObsTrn[, "left_eye_center_x"],
y = thsObsTrn[, "left_eye_center_y"]),
ptQry = bstObsTrn[, c("x", "y")])
}
}
}
write.csv(outObsTrn, paste0(glbOut$pfx, "Train.csv"), row.names = FALSE)
print(sprintf("outObsNew Distribution:"))
smmry <- lapply(metrics, function(metric) {
tapply(outObsNew[, metric], outObsNew[, "label"], summary) })
names(smmry) <- metrics; print(smmry)
print(myplot_violin(outObsNew, metrics, xcol_name = "label") +
ggtitle(sprintf("%s:%s:outObsNew", feat, "left_eye_center")))
require(lazyeval)
for (metric in metrics) {
selObsNew <- dplyr::group_by(outObsNew, ImageId) %>%
dplyr::filter_(interp(~(min_rank(desc(var)) <= 1), var = as.name(metric))) %>%
dplyr::ungroup() %>%
dplyr::arrange_(metric) %>%
as.data.frame()
# Plot images with least & most 10 metric for detection point in outObsNew
for (type in c("min", "max")) {
print(sprintf("Sample Images of %s(%s.left_eye_center.%s)",
type, feat, metric))
for (id in switch(type,
"min" = head(selObsNew, 10)[, glbFeatsId, TRUE],
"max" = tail(selObsNew, 10)[, glbFeatsId, TRUE])) {
thsObsNew <- selObsNew[selObsNew[, glbFeatsId] == id, ]
print(thsObsNew)
bstObsNew <- foreach(metric2 = metrics, .combine = rbind) %do% {
outObsNew[outObsNew[, glbFeatsId] == id, ] %>%
dplyr::filter_(interp(~(min_rank(desc(var)) <= 1),
var = as.name(metric2)))
}
print(bstObsNew)
jnk <- mygetImage(obsIx = which(glbObsAll[, glbFeatsId] == id),
featImage = feat,
plot = TRUE,
ptFeat = c(x = thsObsNew[, "left_eye_center_x"],
y = thsObsNew[, "left_eye_center_y"]),
ptQry = bstObsNew[, c("x", "y")])
}
}
}
write.csv(outObsNew[, setdiff(names(outObsNew),
c("left_eye_center_x", "left_eye_center_y"))],
paste0(glbOut$pfx, "Test.csv"), row.names = FALSE)
}
extract.features.image.chunk.df <- myadd_chunk(extract.features.image.chunk.df,
paste0("extract.features.image", ".", feat, ".end"),
major.inc = TRUE)
}
glbObsAll <- glbObsAll[, setdiff(names(glbObsAll), names(glbFeatsImage))]
glbFeatsExclude <- union(glbFeatsExclude, names(glbFeatsImage))
}
## label step_major step_minor label_minor
## 1 extract.features.image.bgn 1 0 0
## 2 extract.features.image.Image.bgn 2 0 0
## bgn end elapsed
## 1 86.515 86.523 0.008
## 2 86.523 NA NA
## label step_major step_minor label_minor
## 2 extract.features.image.Image.bgn 2 0 0
## 3 extract.features.image.Image.display 3 0 0
## bgn end elapsed
## 2 86.523 238.84 152.318
## 3 238.841 NA NA
## [1] " Sample images from Train:Image"
## [1] " obsIx:925:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 925 925 Train#0925 5 925 65
## left_eye_center_y
## 925 37
## Loading required package: reshape2
## [1] " obsIx:2219:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 2219 2219 Train#2219 2 2219 65
## left_eye_center_y
## 2219 34
## [1] " obsIx:1136:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 1136 1136 Train#1136 1 1136 62
## left_eye_center_y
## 1136 35
## [1] " obsIx:2195:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 2195 2195 Train#2195 1 2195 62
## left_eye_center_y
## 2195 35
## [1] " obsIx:597:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 597 597 Train#0597 5 597 68
## left_eye_center_y
## 597 33
## [1] " Sample images from Test:Image"
## [1] " obsIx:7283:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 7283 7283 Test#0234 2 7283 NA
## left_eye_center_y
## 7283 NA
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_rect).
## [1] " obsIx:7610:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 7610 7610 Test#0561 1 7610 NA
## left_eye_center_y
## 7610 NA
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_rect).
## [1] " obsIx:7337:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 7337 7337 Test#0288 5 7337 NA
## left_eye_center_y
## 7337 NA
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_rect).
## [1] " obsIx:7604:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 7604 7604 Test#0555 5 7604 NA
## left_eye_center_y
## 7604 NA
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_rect).
## [1] " obsIx:7200:"
## .pos ImageId Image.pxl.1.dgt.1 .pos.y left_eye_center_x
## 7200 7200 Test#0151 1 7200 NA
## left_eye_center_y
## 7200 NA
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_rect).
## [1] "Fixing data for image: Train#1878: obsIx:1878"
## [1] "Fixing data for image: Train#1908: obsIx:1908"
## label step_major step_minor
## 3 extract.features.image.Image.display 3 0
## 4 extract.features.image.Image.patch.mean 4 0
## label_minor bgn end elapsed
## 3 0 238.841 245.264 6.423
## 4 0 245.265 NA NA
## [1] " Mean patch (size = 10) for Image:left_eye_center"
## label step_major step_minor
## 4 extract.features.image.Image.patch.mean 4 0
## 5 extract.features.image.Image.patch.search 5 0
## label_minor bgn end elapsed
## 4 0 245.265 254.275 9.01
## 5 0 254.276 NA NA
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## [1] "Elapsed time: 683.716000 secs"
## [1] "outObsTrn Distribution:"
## $P.cor
## $P.cor$.none
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7128 0.3942 0.5382 0.5095 0.6536 0.9398
##
## $P.cor$left_eye_center
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6360 0.4913 0.6164 0.5824 0.7104 0.9270
##
##
## $P.mnkSml.1
## $P.mnkSml.1$.none
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.836e-05 4.733e-05 6.490e-05 7.011e-05 8.718e-05 2.810e-04
##
## $P.mnkSml.1$left_eye_center
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.848e-05 4.841e-05 6.732e-05 7.317e-05 9.156e-05 2.563e-04
## Warning in myplot_violin(outObsTrn, metrics, xcol_name = "label"):
## xcol_name:label is not a factor; creating label_fctr
## [1] "Sample Images (10 of 12) of is.na(Image.left_eye_center)"
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#1688 NA NA 63 39 0.1180514
## 2 Train#1688 NA NA 63 39 0.1180514
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#1835 NA NA 67 37 0.2237563
## 2 Train#1835 NA NA 63 39 0.1877452
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#1878 NA NA 65 38 0.7494728
## 2 Train#1878 NA NA 67 37 0.7066856
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#1939 NA NA 63 39 0.287143038
## 2 Train#1939 NA NA 63 35 -0.007227532
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2101 NA NA 63 38 0.3205384
## 2 Train#2101 NA NA 63 35 0.1943326
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2138 NA NA 63 39 0.6088198
## 2 Train#2138 NA NA 67 39 0.5532149
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2154 NA NA 67 39 0.6202204
## 2 Train#2154 NA NA 63 39 0.5271037
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2176 NA NA 67 39 0.3033746
## 2 Train#2176 NA NA 67 37 0.1809314
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2187 NA NA 65 35 0.7908460
## 2 Train#2187 NA NA 67 39 0.5887222
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## Source: local data frame [2 x 8]
## Groups: ImageId [1]
##
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## (chr) (int) (int) (int) (int) (dbl)
## 1 Train#2240 NA NA 65 39 0.5363874
## 2 Train#2240 NA NA 67 39 0.4990001
## Variables not shown: P.mnkSml.1 (dbl), label (chr)
## [1] "Sample Images of min(Image.left_eye_center.P.cor)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#1456 72 28 72 28 -0.6360407
## P.mnkSml.1 label
## 1 3.977916e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#1456 72 28 70 30 -0.3776912
## 2 Train#1456 72 28 70 30 -0.3776912
## P.mnkSml.1 label
## 1 4.294351e-05 .none
## 2 4.294351e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 2 Train#1549 78 47 78 47 -0.5489289
## P.mnkSml.1 label
## 2 4.21978e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#1549 78 47 80 45 -0.3571021
## 2 Train#1549 78 47 80 49 -0.3860298
## P.mnkSml.1 label
## 1 4.236846e-05 .none
## 2 4.752634e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 3 Train#2765 63 31 63 31 -0.3139753
## P.mnkSml.1 label
## 3 3.888173e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#2765 63 31 61 33 -0.2454069
## 2 Train#2765 63 31 62 33 -0.2486746
## P.mnkSml.1 label
## 1 4.054103e-05 .none
## 2 4.080954e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 4 Train#5445 65 37 65 37 -0.284615
## P.mnkSml.1 label
## 4 8.220857e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#5445 65 37 67 39 -0.1157250
## 2 Train#5445 65 37 65 39 -0.2177243
## P.mnkSml.1 label
## 1 8.955893e-05 .none
## 2 9.039222e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 5 Train#2830 74 34 74 34 -0.2468946
## P.mnkSml.1 label
## 5 5.175331e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#2830 74 34 76 32 0.2249593
## 2 Train#2830 74 34 72 32 0.1805403
## P.mnkSml.1 label
## 1 5.245942e-05 .none
## 2 6.068728e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 6 Train#4524 61 30 61 30 -0.2393224
## P.mnkSml.1 label
## 6 7.804373e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4524 61 30 63 28 0.1081624
## 2 Train#4524 61 30 63 28 0.1081624
## P.mnkSml.1 label
## 1 8.171742e-05 .none
## 2 8.171742e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7 Train#3505 65 31 65 31 -0.2345882
## P.mnkSml.1 label
## 7 6.071794e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3505 65 31 67 33 0.09625468
## 2 Train#3505 65 31 67 33 0.09625468
## P.mnkSml.1 label
## 1 7.44077e-05 .none
## 2 7.44077e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 8 Train#6939 65 31 65 31 -0.2345882
## P.mnkSml.1 label
## 8 6.071794e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6939 65 31 67 33 0.09625468
## 2 Train#6939 65 31 67 33 0.09625468
## P.mnkSml.1 label
## 1 7.44077e-05 .none
## 2 7.44077e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 9 Train#1389 63 37 63 37 -0.2069886
## P.mnkSml.1 label
## 9 5.587279e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#1389 63 37 65 39 0.09480039
## 2 Train#1389 63 37 65 39 0.09480039
## P.mnkSml.1 label
## 1 7.395032e-05 .none
## 2 7.395032e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 10 Train#3320 69 41 69 41 -0.1608725
## P.mnkSml.1 label
## 10 5.349213e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3320 69 41 67 39 -0.02843444
## 2 Train#3320 69 41 71 43 -0.25619541
## P.mnkSml.1 label
## 1 5.097377e-05 .none
## 2 5.589428e-05 .none
## [1] "Sample Images of max(Image.left_eye_center.P.cor)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7023 Train#0327 64 38 64 38 0.9050761
## P.mnkSml.1 label
## 7023 0.0001520374 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#0327 64 38 64 38 0.9050761
## 2 Train#0327 64 38 63 38 0.8792521
## P.mnkSml.1 label
## 1 0.0001520374 left_eye_center
## 2 0.0001550048 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7024 Train#6897 62 38 62 38 0.9062735
## P.mnkSml.1 label
## 7024 0.0001409013 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6897 62 38 62 39 0.9104232
## 2 Train#6897 62 38 63 38 0.8933328
## P.mnkSml.1 label
## 1 0.0001238780 .none
## 2 0.0001416389 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7025 Train#5608 65 40 65 40 0.9088795
## P.mnkSml.1 label
## 7025 0.0001033814 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#5608 65 40 64 40 0.9158048
## 2 Train#5608 65 40 63 41 0.8754833
## P.mnkSml.1 label
## 1 0.0001038099 .none
## 2 0.0001104939 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7026 Train#3192 65 39 65 39 0.9097553
## P.mnkSml.1 label
## 7026 9.770869e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3192 65 39 65 38 0.9224496
## 2 Train#3192 65 39 65 37 0.8404357
## P.mnkSml.1 label
## 1 0.0001075934 .none
## 2 0.0001078265 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7027 Train#5075 67 40 67 40 0.9127174
## P.mnkSml.1 label
## 7027 5.412629e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#5075 67 40 66 40 0.9203489
## 2 Train#5075 67 40 69 38 0.6464310
## P.mnkSml.1 label
## 1 5.400742e-05 .none
## 2 5.539501e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7028 Train#6943 64 36 64 36 0.913238
## P.mnkSml.1 label
## 7028 8.881756e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6943 64 36 64 36 0.9132380
## 2 Train#6943 64 36 66 36 0.8986056
## P.mnkSml.1 label
## 1 8.881756e-05 left_eye_center
## 2 9.218915e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7029 Train#3529 65 39 65 39 0.9157342
## P.mnkSml.1 label
## 7029 0.0002424008 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3529 65 39 65 39 0.9157342
## 2 Train#3529 65 39 65 39 0.9157342
## P.mnkSml.1 label
## 1 0.0002424008 left_eye_center
## 2 0.0002424008 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7030 Train#6059 68 41 68 41 0.9207783
## P.mnkSml.1 label
## 7030 0.0001787562 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6059 68 41 67 41 0.9233954
## 2 Train#6059 68 41 67 41 0.9233954
## P.mnkSml.1 label
## 1 0.000185896 .none
## 2 0.000185896 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7031 Train#5178 67 40 67 40 0.9212727
## P.mnkSml.1 label
## 7031 6.976744e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#5178 67 40 66 40 0.9296789
## 2 Train#5178 67 40 69 42 0.7815534
## P.mnkSml.1 label
## 1 6.951041e-05 .none
## 2 8.386552e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7032 Train#5882 66 38 66 38 0.9269978
## P.mnkSml.1 label
## 7032 0.0001422099 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#5882 66 38 66 38 0.9269978
## 2 Train#5882 66 38 66 37 0.9218836
## P.mnkSml.1 label
## 1 0.0001422099 left_eye_center
## 2 0.0001589986 .none
## [1] "Sample Images of min(Image.left_eye_center.P.mnkSml.1)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6787 62 35 62 35 0.634311
## P.mnkSml.1 label
## 1 1.848065e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6787 62 35 62 36 0.6773129
## 2 Train#6787 62 35 64 37 0.5841722
## P.mnkSml.1 label
## 1 1.850972e-05 .none
## 2 1.858886e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 2 Train#4647 70 40 70 40 0.381879
## P.mnkSml.1 label
## 2 2.030413e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4647 70 40 68 42 0.60330462
## 2 Train#4647 70 40 68 38 0.08690398
## P.mnkSml.1 label
## 1 2.033402e-05 .none
## 2 2.063196e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 3 Train#6329 69 41 69 41 0.541711
## P.mnkSml.1 label
## 3 2.034553e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6329 69 41 67 42 0.6094017
## 2 Train#6329 69 41 67 39 0.3181454
## P.mnkSml.1 label
## 1 2.034229e-05 .none
## 2 2.042870e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 4 Train#3258 62 34 62 34 0.6265727
## P.mnkSml.1 label
## 4 2.097198e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3258 62 34 61 33 0.6851609
## 2 Train#3258 62 34 64 32 0.4129192
## P.mnkSml.1 label
## 1 2.085216e-05 .none
## 2 2.264191e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 5 Train#7005 48 36 48 36 0.3874381
## P.mnkSml.1 label
## 5 2.113431e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#7005 48 36 49 37 0.4283352
## 2 Train#7005 48 36 50 34 0.3006479
## P.mnkSml.1 label
## 1 2.078322e-05 .none
## 2 2.364636e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 6 Train#3807 63 35 63 35 0.6891749
## P.mnkSml.1 label
## 6 2.149675e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3807 63 35 62 35 0.7117636
## 2 Train#3807 63 35 63 33 0.4987480
## P.mnkSml.1 label
## 1 2.148659e-05 .none
## 2 2.159983e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7 Train#4114 67 36 67 36 0.5959682
## P.mnkSml.1 label
## 7 2.150969e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4114 67 36 65 35 0.6394863
## 2 Train#4114 67 36 69 34 0.4789873
## P.mnkSml.1 label
## 1 2.140885e-05 .none
## 2 2.256113e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 8 Train#2628 63 33 63 33 0.5687351
## P.mnkSml.1 label
## 8 2.165315e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#2628 63 33 61 33 0.6851609
## 2 Train#2628 63 33 65 31 0.2239853
## P.mnkSml.1 label
## 1 2.085216e-05 .none
## 2 2.390892e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 9 Train#3468 66 38 66 38 0.6739389
## P.mnkSml.1 label
## 9 2.173882e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3468 66 38 64 37 0.7733595
## 2 Train#3468 66 38 68 36 0.5468198
## P.mnkSml.1 label
## 1 2.174212e-05 .none
## 2 2.253030e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 10 Train#4862 64 33 64 33 0.2702696
## P.mnkSml.1 label
## 10 2.18927e-05 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4862 64 33 62 35 0.3918556
## 2 Train#4862 64 33 62 35 0.3918556
## P.mnkSml.1 label
## 1 2.222255e-05 .none
## 2 2.222255e-05 .none
## [1] "Sample Images of max(Image.left_eye_center.P.mnkSml.1)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7023 Train#6690 66 39 66 39 0.7284469
## P.mnkSml.1 label
## 7023 0.000209768 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6690 66 39 68 40 0.7511544
## 2 Train#6690 66 39 67 40 0.7488077
## P.mnkSml.1 label
## 1 0.0002110004 .none
## 2 0.0002130583 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7024 Train#2825 67 30 67 30 0.8712119
## P.mnkSml.1 label
## 7024 0.0002205414 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#2825 67 30 67 30 0.8712119
## 2 Train#2825 67 30 67 29 0.8674864
## P.mnkSml.1 label
## 1 0.0002205414 left_eye_center
## 2 0.0002422138 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7025 Train#4366 67 36 67 36 0.822679
## P.mnkSml.1 label
## 7025 0.0002230579 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4366 67 36 69 37 0.8592933
## 2 Train#4366 67 36 68 37 0.8549397
## P.mnkSml.1 label
## 1 0.0002364152 .none
## 2 0.0002392935 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7026 Train#3574 61 37 61 37 0.8298438
## P.mnkSml.1 label
## 7026 0.0002271903 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3574 61 37 60 36 0.8797923
## 2 Train#3574 61 37 60 36 0.8797923
## P.mnkSml.1 label
## 1 0.0002809567 .none
## 2 0.0002809567 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7027 Train#1054 66 38 66 38 0.7508794
## P.mnkSml.1 label
## 7027 0.0002301253 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#1054 66 38 67 38 0.7560073
## 2 Train#1054 66 38 66 38 0.7508794
## P.mnkSml.1 label
## 1 0.0002301193 .none
## 2 0.0002301253 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7028 Train#6526 69 43 69 43 0.8300348
## P.mnkSml.1 label
## 7028 0.0002344257 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6526 69 43 68 43 0.8483222
## 2 Train#6526 69 43 68 43 0.8483222
## P.mnkSml.1 label
## 1 0.0002477668 .none
## 2 0.0002477668 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7029 Train#0001 66 39 66 39 0.7587717
## P.mnkSml.1 label
## 7029 0.0002382601 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#0001 66 39 66 38 0.7632100
## 2 Train#0001 66 39 66 39 0.7587717
## P.mnkSml.1 label
## 1 0.0002289619 .none
## 2 0.0002382601 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7030 Train#3529 65 39 65 39 0.9157342
## P.mnkSml.1 label
## 7030 0.0002424008 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#3529 65 39 65 39 0.9157342
## 2 Train#3529 65 39 65 39 0.9157342
## P.mnkSml.1 label
## 1 0.0002424008 left_eye_center
## 2 0.0002424008 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7031 Train#4936 67 37 67 37 0.8342052
## P.mnkSml.1 label
## 7031 0.0002441951 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#4936 67 37 66 37 0.8360293
## 2 Train#4936 67 37 67 37 0.8342052
## P.mnkSml.1 label
## 1 0.0002392332 .none
## 2 0.0002441951 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7032 Train#6640 64 41 64 41 0.8497793
## P.mnkSml.1 label
## 7032 0.0002562985 left_eye_center
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Train#6640 64 41 63 41 0.8588122
## 2 Train#6640 64 41 63 41 0.8588122
## P.mnkSml.1 label
## 1 0.0002611852 .none
## 2 0.0002611852 .none
## [1] "outObsNew Distribution:"
## $P.cor
## $P.cor$.none
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6452 0.2737 0.4524 0.4225 0.5982 0.9417
##
##
## $P.mnkSml.1
## $P.mnkSml.1$.none
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.889e-05 4.527e-05 6.213e-05 6.759e-05 8.439e-05 2.429e-04
## Warning in myplot_violin(outObsNew, metrics, xcol_name = "label"):
## xcol_name:label is not a factor; creating label_fctr
## [1] "Sample Images of min(Image.left_eye_center.P.cor)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0508 NA NA 63 39 -0.203006
## P.mnkSml.1 label
## 1 6.541984e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0508 NA NA 63 39 -0.2030060
## 2 Test#0508 NA NA 66 39 -0.2881322
## P.mnkSml.1 label
## 1 6.541984e-05 .none
## 2 6.623855e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 2 Test#0689 NA NA 67 39 -0.1180061
## P.mnkSml.1 label
## 2 4.901258e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0689 NA NA 67 39 -0.1180061
## 2 Test#0689 NA NA 63 39 -0.2021308
## P.mnkSml.1 label
## 1 4.901258e-05 .none
## 2 6.434960e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 3 Test#1716 NA NA 63 39 -0.07736262
## P.mnkSml.1 label
## 3 4.168925e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1716 NA NA 63 39 -0.07736262
## 2 Test#1716 NA NA 67 39 -0.19488784
## P.mnkSml.1 label
## 1 4.168925e-05 .none
## 2 4.535159e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 4 Test#0957 NA NA 63 39 -0.06631631
## P.mnkSml.1 label
## 4 6.590351e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0957 NA NA 63 39 -0.06631631
## 2 Test#0957 NA NA 63 39 -0.06631631
## P.mnkSml.1 label
## 1 6.590351e-05 .none
## 2 6.590351e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 5 Test#0786 NA NA 63 39 -0.05640868
## P.mnkSml.1 label
## 5 7.831467e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0786 NA NA 63 39 -0.05640868
## 2 Test#0786 NA NA 67 39 -0.18954377
## P.mnkSml.1 label
## 1 7.831467e-05 .none
## 2 8.036820e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 6 Test#0515 NA NA 63 39 -0.04330126
## P.mnkSml.1 label
## 6 6.237535e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0515 NA NA 63 39 -0.04330126
## 2 Test#0515 NA NA 63 35 -0.25193640
## P.mnkSml.1 label
## 1 6.237535e-05 .none
## 2 6.495704e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7 Test#1489 NA NA 66 39 -0.03896027
## P.mnkSml.1 label
## 7 3.260689e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1489 NA NA 66 39 -0.03896027
## 2 Test#1489 NA NA 67 39 -0.04544792
## P.mnkSml.1 label
## 1 3.260689e-05 .none
## 2 3.314917e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 8 Test#1616 NA NA 63 39 -0.01727762
## P.mnkSml.1 label
## 8 7.195424e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1616 NA NA 63 39 -0.01727762
## 2 Test#1616 NA NA 63 39 -0.01727762
## P.mnkSml.1 label
## 1 7.195424e-05 .none
## 2 7.195424e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 9 Test#0782 NA NA 67 36 -0.01268766
## P.mnkSml.1 label
## 9 2.7856e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0782 NA NA 67 36 -0.01268766
## 2 Test#0782 NA NA 63 36 -0.11683694
## P.mnkSml.1 label
## 1 2.785600e-05 .none
## 2 2.937606e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 10 Test#0922 NA NA 67 39 0.0002143385
## P.mnkSml.1 label
## 10 3.817048e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0922 NA NA 67 39 0.0002143385
## 2 Test#0922 NA NA 63 35 -0.1327946678
## P.mnkSml.1 label
## 1 3.817048e-05 .none
## 2 4.502816e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "Sample Images of max(Image.left_eye_center.P.cor)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1774 Test#1465 NA NA 65 35 0.8989472
## P.mnkSml.1 label
## 1774 6.073518e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1465 NA NA 65 35 0.8989472
## 2 Test#1465 NA NA 66 35 0.8971023
## P.mnkSml.1 label
## 1 6.073518e-05 .none
## 2 6.135766e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1775 Test#1484 NA NA 63 38 0.9015814
## P.mnkSml.1 label
## 1775 0.0002245134 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1484 NA NA 63 38 0.9015814
## 2 Test#1484 NA NA 63 38 0.9015814
## P.mnkSml.1 label
## 1 0.0002245134 .none
## 2 0.0002245134 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1776 Test#1481 NA NA 64 39 0.9123365
## P.mnkSml.1 label
## 1776 0.0001079984 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1481 NA NA 64 39 0.9123365
## 2 Test#1481 NA NA 63 39 0.8974781
## P.mnkSml.1 label
## 1 0.0001079984 .none
## 2 0.0001096531 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1777 Test#1104 NA NA 64 38 0.9145486
## P.mnkSml.1 label
## 1777 9.059217e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1104 NA NA 64 38 0.9145486
## 2 Test#1104 NA NA 65 37 0.9018643
## P.mnkSml.1 label
## 1 9.059217e-05 .none
## 2 9.629907e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1778 Test#1285 NA NA 66 38 0.9151713
## P.mnkSml.1 label
## 1778 8.09453e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1285 NA NA 66 38 0.9151713
## 2 Test#1285 NA NA 67 38 0.9097255
## P.mnkSml.1 label
## 1 8.094530e-05 .none
## 2 8.225353e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1779 Test#1037 NA NA 63 35 0.9231587
## P.mnkSml.1 label
## 1779 9.275495e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1037 NA NA 63 35 0.9231587
## 2 Test#1037 NA NA 63 35 0.9231587
## P.mnkSml.1 label
## 1 9.275495e-05 .none
## 2 9.275495e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1780 Test#1445 NA NA 66 39 0.923982
## P.mnkSml.1 label
## 1780 0.0002358088 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1445 NA NA 66 39 0.9239820
## 2 Test#1445 NA NA 67 39 0.9197258
## P.mnkSml.1 label
## 1 0.0002358088 .none
## 2 0.0002428970 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1781 Test#0802 NA NA 63 38 0.9398131
## P.mnkSml.1 label
## 1781 0.0001385619 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0802 NA NA 63 38 0.9398131
## 2 Test#0802 NA NA 63 38 0.9398131
## P.mnkSml.1 label
## 1 0.0001385619 .none
## 2 0.0001385619 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1782 Test#0912 NA NA 67 35 0.9417233
## P.mnkSml.1 label
## 1782 7.27685e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0912 NA NA 67 35 0.9417233
## 2 Test#0912 NA NA 67 35 0.9417233
## P.mnkSml.1 label
## 1 7.27685e-05 .none
## 2 7.27685e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1783 Test#1526 NA NA 67 35 0.9417233
## P.mnkSml.1 label
## 1783 7.27685e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1526 NA NA 67 35 0.9417233
## 2 Test#1526 NA NA 67 35 0.9417233
## P.mnkSml.1 label
## 1 7.27685e-05 .none
## 2 7.27685e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "Sample Images of min(Image.left_eye_center.P.mnkSml.1)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1289 NA NA 64 35 0.6909709
## P.mnkSml.1 label
## 1 1.979623e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1289 NA NA 63 35 0.6998731
## 2 Test#1289 NA NA 64 35 0.6909709
## P.mnkSml.1 label
## 1 1.976884e-05 .none
## 2 1.979623e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 2 Test#1169 NA NA 67 35 0.1109021
## P.mnkSml.1 label
## 2 1.998574e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1169 NA NA 65 39 0.7211337
## 2 Test#1169 NA NA 67 35 0.1109021
## P.mnkSml.1 label
## 1 1.934510e-05 .none
## 2 1.998574e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 3 Test#1009 NA NA 66 35 0.5917185
## P.mnkSml.1 label
## 3 2.223573e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1009 NA NA 63 35 0.7304313
## 2 Test#1009 NA NA 66 35 0.5917185
## P.mnkSml.1 label
## 1 2.218344e-05 .none
## 2 2.223573e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 4 Test#1008 NA NA 63 39 0.1484696
## P.mnkSml.1 label
## 4 2.240953e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1008 NA NA 67 39 0.2394538
## 2 Test#1008 NA NA 63 39 0.1484696
## P.mnkSml.1 label
## 1 2.231986e-05 .none
## 2 2.240953e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 5 Test#1337 NA NA 66 39 0.100492
## P.mnkSml.1 label
## 5 2.272633e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1337 NA NA 67 39 0.1125013
## 2 Test#1337 NA NA 66 39 0.1004920
## P.mnkSml.1 label
## 1 2.271633e-05 .none
## 2 2.272633e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 6 Test#0821 NA NA 67 38 0.1245349
## P.mnkSml.1 label
## 6 2.281195e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0821 NA NA 67 35 0.4010753
## 2 Test#0821 NA NA 67 38 0.1245349
## P.mnkSml.1 label
## 1 2.276470e-05 .none
## 2 2.281195e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 7 Test#0674 NA NA 67 35 0.06522551
## P.mnkSml.1 label
## 7 2.296003e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0674 NA NA 67 39 0.64216064
## 2 Test#0674 NA NA 67 35 0.06522551
## P.mnkSml.1 label
## 1 2.243176e-05 .none
## 2 2.296003e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 8 Test#1668 NA NA 67 38 0.4548755
## P.mnkSml.1 label
## 8 2.306557e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1668 NA NA 67 39 0.5458976
## 2 Test#1668 NA NA 67 38 0.4548755
## P.mnkSml.1 label
## 1 2.302415e-05 .none
## 2 2.306557e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 9 Test#1386 NA NA 63 39 0.5187169
## P.mnkSml.1 label
## 9 2.311854e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1386 NA NA 67 37 0.6036533
## 2 Test#1386 NA NA 63 39 0.5187169
## P.mnkSml.1 label
## 1 2.239174e-05 .none
## 2 2.311854e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 10 Test#0985 NA NA 67 35 0.04575563
## P.mnkSml.1 label
## 10 2.37353e-05 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0985 NA NA 63 39 0.46054064
## 2 Test#0985 NA NA 67 35 0.04575563
## P.mnkSml.1 label
## 1 2.199962e-05 .none
## 2 2.373530e-05 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## [1] "Sample Images of max(Image.left_eye_center.P.mnkSml.1)"
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1774 Test#0491 NA NA 65 38 0.7946929
## P.mnkSml.1 label
## 1774 0.0001964987 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0491 NA NA 67 38 0.8026069
## 2 Test#0491 NA NA 65 38 0.7946929
## P.mnkSml.1 label
## 1 0.0001820390 .none
## 2 0.0001964987 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1775 Test#0104 NA NA 65 38 0.7548987
## P.mnkSml.1 label
## 1775 0.0001981387 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0104 NA NA 66 38 0.7549165
## 2 Test#0104 NA NA 65 38 0.7548987
## P.mnkSml.1 label
## 1 0.0001973209 .none
## 2 0.0001981387 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1776 Test#0517 NA NA 64 35 0.8696209
## P.mnkSml.1 label
## 1776 0.0002001473 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0517 NA NA 64 35 0.8696209
## 2 Test#0517 NA NA 64 35 0.8696209
## P.mnkSml.1 label
## 1 0.0002001473 .none
## 2 0.0002001473 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1777 Test#1266 NA NA 67 36 0.7740234
## P.mnkSml.1 label
## 1777 0.0002060281 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1266 NA NA 67 36 0.7740234
## 2 Test#1266 NA NA 67 36 0.7740234
## P.mnkSml.1 label
## 1 0.0002060281 .none
## 2 0.0002060281 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1778 Test#1582 NA NA 64 35 0.7805427
## P.mnkSml.1 label
## 1778 0.0002083105 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1582 NA NA 64 35 0.7805427
## 2 Test#1582 NA NA 64 35 0.7805427
## P.mnkSml.1 label
## 1 0.0002083105 .none
## 2 0.0002083105 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1779 Test#1484 NA NA 63 38 0.9015814
## P.mnkSml.1 label
## 1779 0.0002245134 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1484 NA NA 63 38 0.9015814
## 2 Test#1484 NA NA 63 38 0.9015814
## P.mnkSml.1 label
## 1 0.0002245134 .none
## 2 0.0002245134 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1780 Test#1421 NA NA 67 36 0.8348689
## P.mnkSml.1 label
## 1780 0.0002324965 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1421 NA NA 67 36 0.8348689
## 2 Test#1421 NA NA 67 36 0.8348689
## P.mnkSml.1 label
## 1 0.0002324965 .none
## 2 0.0002324965 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1781 Test#0819 NA NA 65 35 0.8584465
## P.mnkSml.1 label
## 1781 0.000233622 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#0819 NA NA 65 35 0.8584465
## 2 Test#0819 NA NA 65 35 0.8584465
## P.mnkSml.1 label
## 1 0.000233622 .none
## 2 0.000233622 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1782 Test#1149 NA NA 67 37 0.8271626
## P.mnkSml.1 label
## 1782 0.0002412474 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1149 NA NA 67 37 0.8271626
## 2 Test#1149 NA NA 67 37 0.8271626
## P.mnkSml.1 label
## 1 0.0002412474 .none
## 2 0.0002412474 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1783 Test#1445 NA NA 67 39 0.9197258
## P.mnkSml.1 label
## 1783 0.000242897 .none
## ImageId left_eye_center_x left_eye_center_y x y P.cor
## 1 Test#1445 NA NA 66 39 0.9239820
## 2 Test#1445 NA NA 67 39 0.9197258
## P.mnkSml.1 label
## 1 0.0002358088 .none
## 2 0.0002428970 .none
## Warning: Removed 1 rows containing missing values (geom_point).
## label step_major step_minor
## 5 extract.features.image.Image.patch.search 5 0
## 6 extract.features.image.Image.end 6 0
## label_minor bgn end elapsed
## 5 0 254.276 996.627 742.352
## 6 0 996.628 NA NA
myadd_chunk(extract.features.image.chunk.df, paste0("extract.features.image", ".end"),
major.inc = TRUE)
## label step_major step_minor label_minor
## 6 extract.features.image.Image.end 6 0 0
## 7 extract.features.image.end 7 0 0
## bgn end elapsed
## 6 996.628 996.668 0.04
## 7 996.669 NA NA
## label step_major step_minor
## 1 extract.features.image.bgn 1 0
## 2 extract.features.image.Image.bgn 2 0
## 3 extract.features.image.Image.display 3 0
## 4 extract.features.image.Image.patch.mean 4 0
## 5 extract.features.image.Image.patch.search 5 0
## 6 extract.features.image.Image.end 6 0
## 7 extract.features.image.end 7 0
## label_minor bgn end elapsed
## 1 0 86.515 86.523 0.008
## 2 0 86.523 238.840 152.318
## 3 0 238.841 245.264 6.423
## 4 0 245.265 254.275 9.010
## 5 0 254.276 996.627 742.352
## 6 0 996.628 996.668 0.040
## 7 0 996.669 NA NA
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features.price", major.inc = FALSE)
## label step_major step_minor label_minor bgn end
## 7 extract.features.image 3 2 2 86.484 996.684
## 8 extract.features.price 3 3 3 996.684 NA
## elapsed
## 7 910.2
## 8 NA
3.3: extract features price## label step_major step_minor label_minor bgn
## 1 extract.features.price.bgn 1 0 0 1019.211
## end elapsed
## 1 NA NA
## label step_major step_minor label_minor bgn
## 8 extract.features.price 3 3 3 996.684
## 9 extract.features.text 3 4 4 1019.222
## end elapsed
## 8 1019.221 22.537
## 9 NA NA
3.4: extract features text## label step_major step_minor label_minor bgn end
## 1 extract.features.text.bgn 1 0 0 1019.276 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor bgn
## 9 extract.features.text 3 4 4 1019.222
## 10 extract.features.string 3 5 5 1019.286
## end elapsed
## 9 1019.286 0.064
## 10 NA NA
3.5: extract features string## label step_major step_minor label_minor bgn
## 1 extract.features.string.bgn 1 0 0 1019.318
## end elapsed
## 1 NA NA
## label step_major step_minor
## 1 extract.features.string.bgn 1 0
## 2 extract.features.stringfactorize.str.vars 2 0
## label_minor bgn end elapsed
## 1 0 1019.318 1019.327 0.01
## 2 0 1019.328 NA NA
## .src ImageId Image.pxl.1.dgt.1
## ".src" "ImageId" "Image.pxl.1.dgt.1"
## label step_major step_minor label_minor bgn
## 10 extract.features.string 3 5 5 1019.286
## 11 extract.features.end 3 6 6 1019.343
## end elapsed
## 10 1019.342 0.056
## 11 NA NA
3.6: extract features end## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## label step_major step_minor label_minor bgn
## 11 extract.features.end 3 6 6 1019.343
## 12 manage.missing.data 4 0 0 1020.227
## end elapsed
## 11 1020.226 0.883
## 12 NA NA
4.0: manage missing data4.0: manage missing data4.0: manage missing data4.0: manage missing data4.0: manage missing datafit.models_0_chunk_df <- myadd_chunk(NULL, "fit.models_0_bgn", label.minor = "setup")
# load(paste0(glbOut$pfx, "dsk.RData"))
get_model_sel_frmla <- function() {
model_evl_terms <- c(NULL)
# min.aic.fit might not be avl
lclMdlEvlCriteria <-
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]
for (metric in lclMdlEvlCriteria)
model_evl_terms <- c(model_evl_terms,
ifelse(length(grep("max", metric)) > 0, "-", "+"), metric)
if (glb_is_classification && glb_is_binomial)
model_evl_terms <- c(model_evl_terms, "-", "opt.prob.threshold.OOB")
model_sel_frmla <- as.formula(paste(c("~ ", model_evl_terms), collapse = " "))
return(model_sel_frmla)
}
get_dsp_models_df <- function() {
dsp_models_cols <- c("id",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
dsp_models_df <-
#orderBy(get_model_sel_frmla(), glb_models_df)[, c("id", glbMdlMetricsEval)]
orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]
nCvMdl <- sapply(glb_models_lst, function(mdl) nrow(mdl$results))
nParams <- sapply(glb_models_lst, function(mdl) ifelse(mdl$method == "custom", 0,
nrow(subset(modelLookup(mdl$method), parameter != "parameter"))))
# nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
# nParams <- nParams[names(nParams) != "avNNet"]
if (length(cvMdlProblems <- nCvMdl[nCvMdl <= nParams]) > 0) {
print("Cross Validation issues:")
warning("Cross Validation issues:")
print(cvMdlProblems)
}
pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))
# length(pltMdls) == 21
png(paste0(glbOut$pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
grid.newpage()
pushViewport(viewport(layout = grid.layout(ceiling(length(pltMdls) / 2.0), 2)))
pltIx <- 1
for (mdlId in pltMdls) {
print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
layout.pos.col = ((pltIx - 1) %% 2) + 1))
pltIx <- pltIx + 1
}
dev.off()
if (all(row.names(dsp_models_df) != dsp_models_df$id))
row.names(dsp_models_df) <- dsp_models_df$id
return(dsp_models_df)
}
#get_dsp_models_df()
if (glb_is_classification && glb_is_binomial &&
(length(unique(glbObsFit[, glb_rsp_var])) < 2))
stop("glbObsFit$", glb_rsp_var, ": contains less than 2 unique values: ",
paste0(unique(glbObsFit[, glb_rsp_var]), collapse=", "))
max_cor_y_x_vars <- orderBy(~ -cor.y.abs,
subset(glb_feats_df, (exclude.as.feat == 0) & !nzv & !is.cor.y.abs.low &
is.na(cor.high.X)))[1:2, "id"]
max_cor_y_x_vars <- max_cor_y_x_vars[!is.na(max_cor_y_x_vars)]
if (length(max_cor_y_x_vars) < 2)
max_cor_y_x_vars <- union(max_cor_y_x_vars, ".pos")
if (!is.null(glb_Baseline_mdl_var)) {
if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &
(glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"]))
stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
" than the Baseline var: ", glb_Baseline_mdl_var)
}
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")
# Model specs
c("id.prefix", "method", "type",
# trainControl params
"preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
# train params
"metric", "metric.maximize", "tune.df")
# Baseline
if (!is.null(glb_Baseline_mdl_var)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Baseline"), major.inc = FALSE,
label.minor = "mybaseln_classfr")
ret_lst <- myfit_mdl(mdl_id="Baseline",
model_method="mybaseln_classfr",
indep_vars_vctr=glb_Baseline_mdl_var,
rsp_var=glb_rsp_var,
fit_df=glbObsFit, OOB_df=glbObsOOB)
}
# Most Frequent Outcome "MFO" model: mean(y) for regression
# Not using caret's nullModel since model stats not avl
# Cannot use rpart for multinomial classification since it predicts non-MFO
if (glb_is_classification) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "MFO"), major.inc = FALSE,
label.minor = "myMFO_classfr")
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "MFO", type = glb_model_type, trainControl.method = "none",
train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr"))),
indep_vars = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
# "random" model - only for classification;
# none needed for regression since it is same as MFO
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Random"), major.inc = FALSE,
label.minor = "myrandom_classfr")
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Random", type = glb_model_type, trainControl.method = "none",
train.method = "myrandom_classfr")),
indep_vars = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Max.cor.Y
# Check impact of cv
# rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.rcv.*X*"), major.inc = FALSE,
label.minor = "glmnet")
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Max.cor.Y.rcv.1X1", type=glb_model_type, trainControl.method="none",
train.method="glmnet")),
indep_vars=max_cor_y_x_vars, rsp_var=glb_rsp_var,
fit_df=glbObsFit, OOB_df=glbObsOOB)
if (glbMdlCheckRcv) {
# rcv_n_folds == 1 & rcv_n_repeats > 1 crashes
for (rcv_n_folds in seq(3, glb_rcv_n_folds + 2, 2))
for (rcv_n_repeats in seq(1, glb_rcv_n_repeats + 2, 2)) {
# Experiment specific code to avoid caret crash
# lcl_tune_models_df <- rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha",
# vals = "0.100 0.325 0.550 0.775 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda",
# vals = "9.342e-02")
# )
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
list(
id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
type = glb_model_type,
# tune.df = lcl_tune_models_df,
trainControl.method = "repeatedcv",
trainControl.number = rcv_n_folds,
trainControl.repeats = rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.method = "glmnet", train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize)),
indep_vars = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Add parallel coordinates graph of glb_models_df[, glbMdlMetricsEval] to evaluate cv parameters
tmp_models_cols <- c("id", "max.nTuningRuns",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
print(myplot_parcoord(obs_df = subset(glb_models_df,
grepl("Max.cor.Y.rcv.", id, fixed = TRUE),
select = -feats)[, tmp_models_cols],
id_var = "id"))
}
# Useful for stacking decisions
# fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
# paste0("fit.models_0_", "Max.cor.Y[rcv.1X1.cp.0|]"), major.inc = FALSE,
# label.minor = "rpart")
#
# ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
# id.prefix = "Max.cor.Y.rcv.1X1.cp.0", type = glb_model_type, trainControl.method = "none",
# train.method = "rpart",
# tune.df=data.frame(method="rpart", parameter="cp", min=0.0, max=0.0, by=0.1))),
# indep_vars=max_cor_y_x_vars, rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB)
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y",
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "rpart")),
indep_vars = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
if ((length(glbFeatsDateTime) > 0) &&
(sum(grepl(paste(names(glbFeatsDateTime), "\\.day\\.minutes\\.poly\\.", sep = ""),
names(glbObsAll))) > 0)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.Time.Poly"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars,
grep(paste(names(glbFeatsDateTime), "\\.day\\.minutes\\.poly\\.", sep = ""),
names(glbObsAll), value = TRUE))
indepVars <- myadjust_interaction_feats(indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Time.Poly",
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indep_vars = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
if ((length(glbFeatsDateTime) > 0) &&
(sum(grepl(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep = ""),
names(glbObsAll))) > 0)) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Max.cor.Y.Time.Lag"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars,
grep(paste(names(glbFeatsDateTime), "\\.last[[:digit:]]", sep = ""),
names(glbObsAll), value = TRUE))
indepVars <- myadjust_interaction_feats(indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Time.Lag",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indep_vars = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
if (length(glbFeatsText) > 0) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Txt.*"), major.inc = FALSE,
label.minor = "glmnet")
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.(?!([T|P]\\.))", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjust_interaction_feats(indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.nonTP",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indep_vars = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.T\\.", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjust_interaction_feats(indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.onlyT",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indep_vars = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
indepVars <- c(max_cor_y_x_vars)
for (txtFeat in names(glbFeatsText))
indepVars <- union(indepVars,
grep(paste(str_to_upper(substr(txtFeat, 1, 1)), "\\.P\\.", sep = ""),
names(glbObsAll), perl = TRUE, value = TRUE))
indepVars <- myadjust_interaction_feats(indepVars)
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Max.cor.Y.Text.onlyP",
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds, trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = "glmnet")),
indep_vars = indepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
# Interactions.High.cor.Y
if (length(int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
subset(glb_feats_df, nzv)$id)) > 0) {
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Interact.High.cor.Y"), major.inc = FALSE,
label.minor = "glmnet")
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Interact.High.cor.Y",
type=glb_model_type, trainControl.method="repeatedcv",
trainControl.number=glb_rcv_n_folds, trainControl.repeats=glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method="glmnet")),
indep_vars=c(max_cor_y_x_vars, paste(max_cor_y_x_vars[1], int_feats, sep=":")),
rsp_var=glb_rsp_var,
fit_df=glbObsFit, OOB_df=glbObsOOB)
}
# Low.cor.X
fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
paste0("fit.models_0_", "Low.cor.X"), major.inc = FALSE,
label.minor = "glmnet")
indep_vars <- subset(glb_feats_df, is.na(cor.high.X) & !nzv &
(exclude.as.feat != 1))[, "id"]
indep_vars <- myadjust_interaction_feats(indep_vars)
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Low.cor.X",
type=glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method="repeatedcv",
trainControl.number=glb_rcv_n_folds, trainControl.repeats=glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method="glmnet")),
indep_vars=indep_vars, rsp_var=glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
fit.models_0_chunk_df <-
myadd_chunk(fit.models_0_chunk_df, "fit.models_0_end", major.inc = FALSE,
label.minor = "teardown")
rm(ret_lst)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
fit.models_1_chunk_df <- myadd_chunk(NULL, "fit.models_1_bgn", label.minor = "setup")
# refactor code for outliers / ensure all model runs exclude outliers in this chunk ???
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
topindep_var <- NULL; interact_vars <- NULL;
for (mdl_id_pfx in names(glbMdlFamilies)) {
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, paste0("fit.models_1_", mdl_id_pfx),
major.inc = FALSE, label.minor = "setup")
indep_vars <- NULL;
if (grepl("\\.Interact", mdl_id_pfx)) {
if (is.null(topindep_var) && is.null(interact_vars)) {
# select best glmnet model upto now
dsp_models_df <- orderBy(model_sel_frmla <- get_model_sel_frmla(),
glb_models_df)
dsp_models_df <- subset(dsp_models_df,
grepl(".glmnet", id, fixed = TRUE))
bst_mdl_id <- dsp_models_df$id[1]
mdl_id_pfx <-
paste(c(head(unlist(strsplit(bst_mdl_id, "[.]")), -1), "Interact"),
collapse=".")
# select important features
if (is.null(bst_featsimp_df <-
myget_feats_importance(glb_models_lst[[bst_mdl_id]]))) {
warning("Base model for RFE.Interact: ", bst_mdl_id,
" has no important features")
next
}
topindep_ix <- 1
while (is.null(topindep_var) && (topindep_ix <= nrow(bst_featsimp_df))) {
topindep_var <- row.names(bst_featsimp_df)[topindep_ix]
if (grepl(".fctr", topindep_var, fixed=TRUE))
topindep_var <-
paste0(unlist(strsplit(topindep_var, ".fctr"))[1], ".fctr")
if (topindep_var %in% names(glbFeatsInteractionOnly)) {
topindep_var <- NULL; topindep_ix <- topindep_ix + 1
} else break
}
# select features with importance > max(10, importance of .rnorm) & is not highest
# combine factor dummy features to just the factor feature
if (length(pos_rnorm <-
grep(".rnorm", row.names(bst_featsimp_df), fixed=TRUE)) > 0)
imp_rnorm <- bst_featsimp_df[pos_rnorm, 1] else
imp_rnorm <- NA
imp_cutoff <- max(10, imp_rnorm, na.rm=TRUE)
interact_vars <-
tail(row.names(subset(bst_featsimp_df,
imp > imp_cutoff)), -1)
if (length(interact_vars) > 0) {
interact_vars <-
myadjust_interaction_feats(myextract_actual_feats(interact_vars))
interact_vars <-
interact_vars[!grepl(topindep_var, interact_vars, fixed=TRUE)]
}
### bid0_sp only
# interact_vars <- c(
# "biddable", "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio", "D.sum.TfIdf",
# "D.TfIdf.sum.post.stop", "D.TfIdf.sum.post.stem", "D.ratio.wrds.stop.n.wrds.n", "D.chrs.uppr.n.log",
# "D.chrs.n.log", "color.fctr"
# # , "condition.fctr", "prdl.my.descr.fctr"
# )
# interact_vars <- setdiff(interact_vars, c("startprice.dgt2.is9", "color.fctr"))
###
indep_vars <- myextract_actual_feats(row.names(bst_featsimp_df))
indep_vars <- setdiff(indep_vars, topindep_var)
if (length(interact_vars) > 0) {
indep_vars <-
setdiff(indep_vars, myextract_actual_feats(interact_vars))
indep_vars <- c(indep_vars,
paste(topindep_var, setdiff(interact_vars, topindep_var),
sep = "*"))
} else indep_vars <- union(indep_vars, topindep_var)
}
}
if (is.null(indep_vars))
indep_vars <- glb_mdl_feats_lst[[mdl_id_pfx]]
if (is.null(indep_vars) && grepl("RFE\\.", mdl_id_pfx))
indep_vars <- myextract_actual_feats(predictors(rfe_fit_results))
if (is.null(indep_vars))
indep_vars <- subset(glb_feats_df, !nzv & (exclude.as.feat != 1))[, "id"]
if ((length(indep_vars) == 1) && (grepl("^%<d-%", indep_vars))) {
indep_vars <-
eval(parse(text = str_trim(unlist(strsplit(indep_vars, "%<d-%"))[2])))
}
indep_vars <- myadjust_interaction_feats(indep_vars)
if (grepl("\\.Interact", mdl_id_pfx)) {
# if (method != tail(unlist(strsplit(bst_mdl_id, "[.]")), 1)) next
if (is.null(glbMdlFamilies[[mdl_id_pfx]])) {
if (!is.null(glbMdlFamilies[["Best.Interact"]]))
glbMdlFamilies[[mdl_id_pfx]] <-
glbMdlFamilies[["Best.Interact"]]
}
}
if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
fitobs_df <- glbObsFit[!(glbObsFit[, glbFeatsId] %in%
glbObsFitOutliers[[mdl_id_pfx]]), ]
print(sprintf("Outliers removed: %d", nrow(glbObsFit) - nrow(fitobs_df)))
print(setdiff(glbObsFit[, glbFeatsId], fitobs_df[, glbFeatsId]))
} else fitobs_df <- glbObsFit
if (is.null(glbMdlFamilies[[mdl_id_pfx]]))
mdl_methods <- glbMdlMethods else
mdl_methods <- glbMdlFamilies[[mdl_id_pfx]]
for (method in mdl_methods) {
if (method %in% c("rpart", "rf")) {
# rpart: fubar's the tree
# rf: skip the scenario w/ .rnorm for speed
indep_vars <- setdiff(indep_vars, c(".rnorm"))
#mdl_id <- paste0(mdl_id_pfx, ".no.rnorm")
}
fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
paste0("fit.models_1_", mdl_id_pfx), major.inc = FALSE,
label.minor = method)
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdl_id_pfx,
type = glb_model_type,
tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv", # or "none" if nominalWorkflow is crashing
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method)),
indep_vars = indep_vars, rsp_var = glb_rsp_var,
fit_df = fitobs_df, OOB_df = glbObsOOB)
# ntv_mdl <- glmnet(x = as.matrix(
# fitobs_df[, indep_vars]),
# y = as.factor(as.character(
# fitobs_df[, glb_rsp_var])),
# family = "multinomial")
# bgn = 1; end = 100;
# ntv_mdl <- glmnet(x = as.matrix(
# subset(fitobs_df, pop.fctr != "crypto")[bgn:end, indep_vars]),
# y = as.factor(as.character(
# subset(fitobs_df, pop.fctr != "crypto")[bgn:end, glb_rsp_var])),
# family = "multinomial")
}
}
# Check if other preProcess methods improve model performance
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_preProc", major.inc = FALSE,
label.minor = "preProc")
mdl_id <- orderBy(get_model_sel_frmla(), glb_models_df)[1, "id"]
indep_vars_vctr <- trim(unlist(strsplit(glb_models_df[glb_models_df$id == mdl_id,
"feats"], "[,]")))
method <- tail(unlist(strsplit(mdl_id, "[.]")), 1)
mdl_id_pfx <- paste0(head(unlist(strsplit(mdl_id, "[.]")), -1), collapse = ".")
if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
fitobs_df <- glbObsFit[!(glbObsFit[, glbFeatsId] %in%
glbObsFitOutliers[[mdl_id_pfx]]), ]
print(sprintf("Outliers removed: %d", nrow(glbObsFit) - nrow(fitobs_df)))
print(setdiff(glbObsFit[, glbFeatsId], fitobs_df[, glbFeatsId]))
} else fitobs_df <- glbObsFit
for (prePr in glb_preproc_methods) {
# The operations are applied in this order:
# Box-Cox/Yeo-Johnson transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign.
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix=mdl_id_pfx,
type=glb_model_type, tune.df=glbMdlTuneParams,
trainControl.method="repeatedcv",
trainControl.number=glb_rcv_n_folds,
trainControl.repeats=glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method=method, train.preProcess=prePr)),
indep_vars=indep_vars_vctr, rsp_var=glb_rsp_var,
fit_df=fitobs_df, OOB_df=glbObsOOB)
}
# If (All|RFE).X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(mdl_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# orig_glm <- glb_models_lst[["RFE.X.glm"]]$finalModel; print(summary(orig_glm))
# require(car)
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# # if vif errors out with "there are aliased coefficients in the model"
# alias_orig_glm <- alias(orig_glm); alias_complete_orig_glm <- (alias_orig_glm$Complete > 0); alias_complete_orig_glm <- alias_complete_orig_glm[rowSums(alias_complete_orig_glm) > 0, colSums(alias_complete_orig_glm) > 0]; print(alias_complete_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glbObsFit[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE), ]
# all.equal(glbObsAll$S.chrs.uppr.n.log, glbObsAll$A.chrs.uppr.n.log)
# cor(glbObsAll$S.T.herald, glbObsAll$S.T.tribun)
# mydspObs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glbObsAll[, setdiff(names(glbObsAll), myfind_chr_cols_df(glbObsAll))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(mdl_id=paste0(mdl_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df; all.equal(sav_featsimp_df, glb_featsimp_df)
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glm_featsimp_df <- sav_featsimp_df
# easier to exclude features
# require(gdata) # needed for trim
# mdl_id <- "";
# indep_vars_vctr <- head(subset(glb_models_df, grepl("All\\.X\\.", mdl_id), select=feats)
# , 1)[, "feats"]
# indep_vars_vctr <- trim(unlist(strsplit(indep_vars_vctr, "[,]")))
# indep_vars_vctr <- setdiff(indep_vars_vctr, ".rnorm")
# easier to include features
#stop(here"); sav_models_df <- glb_models_df; glb_models_df <- sav_models_df
# !_sp
# mdl_id <- "csm"; indep_vars_vctr <- c(NULL
# ,"prdline.my.fctr", "prdline.my.fctr:.clusterid.fctr"
# ,"prdline.my.fctr*biddable"
# #,"prdline.my.fctr*startprice.log"
# #,"prdline.my.fctr*startprice.diff"
# ,"prdline.my.fctr*condition.fctr"
# ,"prdline.my.fctr*D.terms.post.stop.n"
# #,"prdline.my.fctr*D.terms.post.stem.n"
# ,"prdline.my.fctr*cellular.fctr"
# # ,"<feat1>:<feat2>"
# )
# for (method in glbMdlMethods) {
# ret_lst <- myfit_mdl(mdl_id=mdl_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glbMdlTuneParams)
# csm_mdl_id <- paste0(mdl_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(mdl_id, ".",
# method)]]); print(head(csm_featsimp_df))
# }
###
# Ntv.1.lm <- lm(reformulate(indep_vars_vctr, glb_rsp_var), glbObsTrn); print(summary(Ntv.1.lm))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$imp)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$imp)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glbObsFit),
# union(glb_rsp_var, glbFeatsExclude)))
# indep_vars_vctr_lst[["feat"]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(mdl_id=paste0(mdl_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glbMdlTuneParams,
# model_loss_mtrx=glbMdlMetric_terms,
# model_summaryFunction=glbMdlMetricSummaryFn,
# model_metric=glbMdlMetricSummary,
# model_metric_maximize=glbMdlMetricMaximize)
# Simplify a model
# fit_df <- glbObsFit; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glbObsFit, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glbMdlMetric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
print(glb_models_df)
rm(ret_lst)
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end", major.inc = FALSE,
label.minor = "teardown")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "setup")
plt_models_df <- glb_models_df[, -grep("SD|Upper|Lower", names(glb_models_df))]
for (var in grep("^min.", names(plt_models_df), value=TRUE)) {
plt_models_df[, sub("min.", "inv.", var)] <-
#ifelse(all(is.na(tmp <- plt_models_df[, var])), NA, 1.0 / tmp)
1.0 / plt_models_df[, var]
plt_models_df <- plt_models_df[ , -grep(var, names(plt_models_df))]
}
print(plt_models_df)
# print(myplot_radar(radar_inp_df=plt_models_df))
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(mdl_id %in% grep("random|MFO", plt_models_df$id, value=TRUE)))))
# Compute CI for <metric>SD
glb_models_df <- mutate(glb_models_df,
max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
for (var in grep("SD", names(glb_models_df), value=TRUE)) {
# Does CI alredy exist ?
var_components <- unlist(strsplit(var, "SD"))
varActul <- paste0(var_components[1], var_components[2])
varUpper <- paste0(var_components[1], "Upper", var_components[2])
varLower <- paste0(var_components[1], "Lower", var_components[2])
if (varUpper %in% names(glb_models_df)) {
warning(varUpper, " already exists in glb_models_df")
# Assuming Lower also exists
next
}
print(sprintf("var:%s", var))
# CI is dependent on sample size in t distribution; df=n-1
glb_models_df[, varUpper] <- glb_models_df[, varActul] +
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
glb_models_df[, varLower] <- glb_models_df[, varActul] -
glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
}
# Plot metrics with CI
plt_models_df <- glb_models_df[, "id", FALSE]
pltCI_models_df <- glb_models_df[, "id", FALSE]
for (var in grep("Upper", names(glb_models_df), value=TRUE)) {
var_components <- unlist(strsplit(var, "Upper"))
col_name <- unlist(paste(var_components, collapse=""))
plt_models_df[, col_name] <- glb_models_df[, col_name]
for (name in paste0(var_components[1], c("Upper", "Lower"), var_components[2]))
pltCI_models_df[, name] <- glb_models_df[, name]
}
build_statsCI_data <- function(plt_models_df) {
mltd_models_df <- melt(plt_models_df, id.vars="id")
mltd_models_df$data <- sapply(1:nrow(mltd_models_df),
function(row_ix) tail(unlist(strsplit(as.character(
mltd_models_df[row_ix, "variable"]), "[.]")), 1))
mltd_models_df$label <- sapply(1:nrow(mltd_models_df),
function(row_ix) head(unlist(strsplit(as.character(
mltd_models_df[row_ix, "variable"]),
paste0(".", mltd_models_df[row_ix, "data"]))), 1))
#print(mltd_models_df)
return(mltd_models_df)
}
mltd_models_df <- build_statsCI_data(plt_models_df)
mltdCI_models_df <- melt(pltCI_models_df, id.vars="id")
for (row_ix in 1:nrow(mltdCI_models_df)) {
for (type in c("Upper", "Lower")) {
if (length(var_components <- unlist(strsplit(
as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
#print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
mltdCI_models_df[row_ix, "label"] <- var_components[1]
mltdCI_models_df[row_ix, "data"] <-
unlist(strsplit(var_components[2], "[.]"))[2]
mltdCI_models_df[row_ix, "type"] <- type
break
}
}
}
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
for (type in unique(mltd_models_df$data)) {
var_type <- paste0(var, ".", type)
# if this data is already present, next
if (var_type %in% unique(paste(mltd_models_df$label, mltd_models_df$data,
sep=".")))
next
#print(sprintf("var_type:%s", var_type))
goback_vars <- c(goback_vars, var_type)
}
}
if (length(goback_vars) > 0) {
mltd_goback_df <- build_statsCI_data(glb_models_df[, c("id", goback_vars)])
mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("id", "model_method")],
# all.x=TRUE)
png(paste0(glbOut$pfx, "models_bar.png"), width=480*3, height=480*2)
#print(gp <- myplot_bar(mltd_models_df, "id", "value", colorcol_name="model_method") +
print(gp <- myplot_bar(df=mltd_models_df, xcol_name="id", ycol_names="value") +
geom_errorbar(data=mrgdCI_models_df,
mapping=aes(x=mdl_id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
facet_grid(label ~ data, scales="free") +
theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
dev.off()
print(gp)
dsp_models_cols <- c("id",
glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
# if (glb_is_classification && glb_is_binomial)
# dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")
print(dsp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols])
# print(myplot_radar(radar_inp_df = dsp_models_df))
print("Metrics used for model selection:"); print(get_model_sel_frmla())
print(sprintf("Best model id: %s", dsp_models_df[1, "id"]))
glb_get_predictions <- function(df, mdl_id, rsp_var, prob_threshold_def=NULL, verbose=FALSE) {
mdl <- glb_models_lst[[mdl_id]]
clmnNames <- mygetPredictIds(rsp_var, mdl_id)
predct_var_name <- clmnNames$value
predct_prob_var_name <- clmnNames$prob
predct_accurate_var_name <- clmnNames$is.acc
predct_error_var_name <- clmnNames$err
predct_erabs_var_name <- clmnNames$err.abs
if (glb_is_regression) {
df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="glm"))
df[, predct_error_var_name] <- df[, predct_var_name] - df[, glb_rsp_var]
if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
#facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="auto"))
if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
#facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
stat_smooth(method="glm"))
df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
if (glb_is_classification && glb_is_binomial) {
prob_threshold <- glb_models_df[glb_models_df$id == mdl_id,
"opt.prob.threshold.OOB"]
if (is.null(prob_threshold) || is.na(prob_threshold)) {
warning("Using default probability threshold: ", prob_threshold_def)
if (is.null(prob_threshold <- prob_threshold_def))
stop("Default probability threshold is NULL")
}
df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
df[, predct_var_name] <-
factor(levels(df[, glb_rsp_var])[
(df[, predct_prob_var_name] >=
prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))
# if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
# facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="glm"))
df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
# if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
# #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="auto"))
# if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
# #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
# stat_smooth(method="glm"))
# if prediction is a TP (true +ve), measure distance from 1.0
tp <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
df[tp, predct_erabs_var_name] <- abs(1 - df[tp, predct_prob_var_name])
#rowIx <- which.max(df[tp, predct_erabs_var_name]); df[tp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a TN (true -ve), measure distance from 0.0
tn <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
df[tn, predct_erabs_var_name] <- abs(0 - df[tn, predct_prob_var_name])
#rowIx <- which.max(df[tn, predct_erabs_var_name]); df[tn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a FP (flse +ve), measure distance from 0.0
fp <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
df[fp, predct_erabs_var_name] <- abs(0 - df[fp, predct_prob_var_name])
#rowIx <- which.max(df[fp, predct_erabs_var_name]); df[fp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
# if prediction is a FN (flse -ve), measure distance from 1.0
fn <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
(df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
df[fn, predct_erabs_var_name] <- abs(1 - df[fn, predct_prob_var_name])
#rowIx <- which.max(df[fn, predct_erabs_var_name]); df[fn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]
if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
if (glb_is_classification && !glb_is_binomial) {
df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
probCls <- predict(mdl, newdata = df, type = "prob")
df[, predct_prob_var_name] <- NA
for (cls in names(probCls)) {
mask <- (df[, predct_var_name] == cls)
df[mask, predct_prob_var_name] <- probCls[mask, cls]
}
if (verbose) print(myplot_histogram(df, predct_prob_var_name,
fill_col_name = predct_var_name))
if (verbose) print(myplot_histogram(df, predct_prob_var_name,
facet_frmla = paste0("~", glb_rsp_var)))
df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
# if prediction is erroneous, measure predicted class prob from actual class prob
df[, predct_erabs_var_name] <- 0
for (cls in names(probCls)) {
mask <- (df[, glb_rsp_var] == cls) & (df[, predct_error_var_name])
df[mask, predct_erabs_var_name] <- probCls[mask, cls]
}
df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
}
return(df)
}
#stop(here"); glb2Sav(); glbObsAll <- savObsAll; glbObsTrn <- savObsTrn; glbObsFit <- savObsFit; glbObsOOB <- savObsOOB; sav_models_df <- glb_models_df; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
myget_category_stats <- function(obs_df, mdl_id, label) {
require(dplyr)
require(lazyeval)
predct_var_name <- mygetPredictIds(glb_rsp_var, mdl_id)$value
predct_error_var_name <- mygetPredictIds(glb_rsp_var, mdl_id)$err.abs
if (!predct_var_name %in% names(obs_df))
obs_df <- glb_get_predictions(obs_df, mdl_id, glb_rsp_var)
tmp_obs_df <- obs_df[, c(glbFeatsCategory, glb_rsp_var,
predct_var_name, predct_error_var_name)]
# tmp_obs_df <- obs_df %>%
# dplyr::select_(glbFeatsCategory, glb_rsp_var, predct_var_name, predct_error_var_name)
#dplyr::rename(startprice.log10.predict.RFE.X.glmnet.err=error_abs_OOB)
names(tmp_obs_df)[length(names(tmp_obs_df))] <- paste0("err.abs.", label)
ret_ctgry_df <- tmp_obs_df %>%
dplyr::group_by_(glbFeatsCategory) %>%
dplyr::summarise_(#interp(~sum(abs(var)), var=as.name(glb_rsp_var)),
interp(~sum(var), var=as.name(paste0("err.abs.", label))),
interp(~mean(var), var=as.name(paste0("err.abs.", label))),
interp(~n()))
names(ret_ctgry_df) <- c(glbFeatsCategory,
#paste0(glb_rsp_var, ".abs.", label, ".sum"),
paste0("err.abs.", label, ".sum"),
paste0("err.abs.", label, ".mean"),
paste0(".n.", label))
ret_ctgry_df <- dplyr::ungroup(ret_ctgry_df)
#colSums(ret_ctgry_df[, -grep(glbFeatsCategory, names(ret_ctgry_df))])
return(ret_ctgry_df)
}
#print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
if (!is.null(glb_mdl_ensemble)) {
fit.models_2_chunk_df <- myadd_chunk(fit.models_2_chunk_df,
paste0("fit.models_2_", mdl_id_pfx), major.inc = TRUE,
label.minor = "ensemble")
mdl_id_pfx <- "Ensemble"
if (#(glb_is_regression) |
((glb_is_classification) & (!glb_is_binomial)))
stop("Ensemble models not implemented yet for multinomial classification")
mygetEnsembleAutoMdlIds <- function() {
tmp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
row.names(tmp_models_df) <- tmp_models_df$id
mdl_threshold_pos <-
min(which(grepl("MFO|Random|Baseline", tmp_models_df$id))) - 1
mdlIds <- tmp_models_df$id[1:mdl_threshold_pos]
return(mdlIds[!grepl("Ensemble", mdlIds)])
}
if (glb_mdl_ensemble == "auto") {
glb_mdl_ensemble <- mygetEnsembleAutoMdlIds()
mdl_id_pfx <- paste0(mdl_id_pfx, ".auto")
} else if (grepl("^%<d-%", glb_mdl_ensemble)) {
glb_mdl_ensemble <- eval(parse(text =
str_trim(unlist(strsplit(glb_mdl_ensemble, "%<d-%"))[2])))
}
for (mdl_id in glb_mdl_ensemble) {
if (!(mdl_id %in% names(glb_models_lst))) {
warning("Model ", mdl_id, " in glb_model_ensemble not found !")
next
}
glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id, glb_rsp_var)
glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id, glb_rsp_var)
}
#mdl_id_pfx <- "Ensemble.RFE"; mdlId <- paste0(mdl_id_pfx, ".glmnet")
#glb_mdl_ensemble <- gsub(mygetPredictIds$value, "", grep("RFE\\.X\\.(?!Interact)", row.names(glb_featsimp_df), perl = TRUE, value = TRUE), fixed = TRUE)
#varImp(glb_models_lst[[mdlId]])
#cor_df <- data.frame(cor=cor(glbObsFit[, glb_rsp_var], glbObsFit[, paste(mygetPredictIds$value, glb_mdl_ensemble)], use="pairwise.complete.obs"))
#glbObsFit <- glb_get_predictions(df=glbObsFit, "Ensemble.glmnet", glb_rsp_var);print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="Ensemble.glmnet", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
### bid0_sp
# Better than MFO; models.n=28; min.RMSE.fit=0.0521233; err.abs.fit.sum=7.3631895
# old: Top x from auto; models.n= 5; min.RMSE.fit=0.06311047; err.abs.fit.sum=9.5937080
# RFE only ; models.n=16; min.RMSE.fit=0.05148588; err.abs.fit.sum=7.2875091
# RFE subset only ;models.n= 5; min.RMSE.fit=0.06040702; err.abs.fit.sum=9.059088
# RFE subset only ;models.n= 9; min.RMSE.fit=0.05933167; err.abs.fit.sum=8.7421288
# RFE subset only ;models.n=15; min.RMSE.fit=0.0584607; err.abs.fit.sum=8.5902066
# RFE subset only ;models.n=17; min.RMSE.fit=0.05496899; err.abs.fit.sum=8.0170431
# RFE subset only ;models.n=18; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
# RFE subset only ;models.n=16; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
### bid0_sp
### bid1_sp
# "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
# "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
### bid1_sp
indep_vars <- paste(mygetPredictIds(glb_rsp_var)$value, glb_mdl_ensemble, sep = "")
if (glb_is_classification)
indep_vars <- paste(indep_vars, ".prob", sep = "")
# Some models in glb_mdl_ensemble might not be fitted e.g. RFE.X.Interact
indep_vars <- intersect(indep_vars, names(glbObsFit))
# indep_vars <- grep(mygetPredictIds(glb_rsp_var)$value, names(glbObsFit), fixed=TRUE, value=TRUE)
# if (glb_is_regression)
# indep_vars <- indep_vars[!grepl("(err\\.abs|accurate)$", indep_vars)]
# if (glb_is_classification && glb_is_binomial)
# indep_vars <- grep("prob$", indep_vars, value=TRUE) else
# indep_vars <- indep_vars[!grepl("err$", indep_vars)]
#rfe_fit_ens_results <- myrun_rfe(glbObsFit, indep_vars)
for (method in c("glm", "glmnet")) {
for (trainControlMethod in
c("boot", "boot632", "cv", "repeatedcv"
#, "LOOCV" # tuneLength * nrow(fitDF)
, "LGOCV", "adaptive_cv"
#, "adaptive_boot" #error: adaptive$min should be less than 3
#, "adaptive_LGOCV" #error: adaptive$min should be less than 3
)) {
#sav_models_df <- glb_models_df; all.equal(sav_models_df, glb_models_df)
#glb_models_df <- sav_models_df; print(glb_models_df$id)
if ((method == "glm") && (trainControlMethod != "repeatedcv"))
# glm used only to identify outliers
next
ret_lst <- myfit_mdl(
mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = paste0(mdl_id_pfx, ".", trainControlMethod),
type = glb_model_type, tune.df = NULL,
trainControl.method = trainControlMethod,
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method)),
indep_vars = indep_vars, rsp_var = glb_rsp_var,
fit_df = glbObsFit, OOB_df = glbObsOOB)
}
}
dsp_models_df <- get_dsp_models_df()
}
if (is.null(glb_sel_mdl_id))
glb_sel_mdl_id <- dsp_models_df[1, "id"] else
print(sprintf("User specified selection: %s", glb_sel_mdl_id))
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks
print(sprintf("%s fit prediction diagnostics:", glb_sel_mdl_id))
glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
print(sprintf("%s OOB prediction diagnostics:", glb_sel_mdl_id))
glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
print(glb_featsimp_df <- myget_feats_importance(mdl = glb_sel_mdl, featsimp_df = NULL))
#mdl_id <-"RFE.X.glmnet"; glb_featsimp_df <- myget_feats_importance(glb_models_lst[[mdl_id]], glb_featsimp_df); glb_featsimp_df[, paste0(mdl_id, ".imp")] <- glb_featsimp_df$imp; print(glb_featsimp_df)
#print(head(sbst_featsimp_df <- subset(glb_featsimp_df, is.na(RFE.X.glmnet.imp) | (abs(RFE.X.YeoJohnson.glmnet.imp - RFE.X.glmnet.imp) > 0.0001), select=-imp)))
#print(orderBy(~ -cor.y.abs, subset(glb_feats_df, id %in% c(row.names(sbst_featsimp_df), "startprice.dcm1.is9", "D.weight.post.stop.sum"))))
# Used again in fit.data.training & predict.data.new chunks
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
if (!is.null(featsimp_df <- glb_featsimp_df)) {
featsimp_df$feat <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
featsimp_df$feat.interact <- gsub("(.*?):(.*)", "\\2", featsimp_df$feat)
featsimp_df$feat <- gsub("(.*?):(.*)", "\\1", featsimp_df$feat)
featsimp_df$feat.interact <-
ifelse(featsimp_df$feat.interact == featsimp_df$feat,
NA, featsimp_df$feat.interact)
featsimp_df$feat <-
gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat)
featsimp_df$feat.interact <-
gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat.interact)
featsimp_df <- orderBy(~ -imp.max,
summaryBy(imp ~ feat + feat.interact, data=featsimp_df,
FUN=max))
#rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
featsimp_df <- subset(featsimp_df, !is.na(imp.max))
if (nrow(featsimp_df) > 5) {
warning("Limiting important feature scatter plots to 5 out of ",
nrow(featsimp_df))
featsimp_df <- head(featsimp_df, 5)
}
# if (!all(is.na(featsimp_df$feat.interact)))
# stop("not implemented yet")
rsp_var_out <- mygetPredictIds(glb_rsp_var, mdl_id)$value
for (var in featsimp_df$feat) {
plot_df <- melt(obs_df, id.vars = var,
measure.vars = c(glb_rsp_var, rsp_var_out))
print(myplot_scatter(plot_df, var, "value", colorcol_name = "variable",
facet_colcol_name = "variable", jitter = TRUE) +
guides(color = FALSE))
}
}
if (glb_is_regression) {
if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
warning("No important features in glb_fin_mdl") else
print(myplot_prediction_regression(df=obs_df,
feat_x=ifelse(nrow(featsimp_df) > 1, featsimp_df$feat[2],
".rownames"),
feat_y=featsimp_df$feat[1],
rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
id_vars=glbFeatsId)
# + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
# + geom_point(aes_string(color="<col_name>.fctr")) # to color the plot
)
}
if (glb_is_classification) {
if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
warning("No features in selected model are statistically important")
else print(myplot_prediction_classification(df = obs_df,
feat_x = ifelse(nrow(featsimp_df) > 1,
featsimp_df$feat[2], ".rownames"),
feat_y = featsimp_df$feat[1],
rsp_var = glb_rsp_var,
rsp_var_out = rsp_var_out,
id_vars = glbFeatsId,
prob_threshold = prob_threshold))
}
}
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id)
if (!is.null(glbFeatsCategory)) {
glbLvlCategory <- merge(glbLvlCategory,
myget_category_stats(obs_df = glbObsFit, mdl_id = glb_sel_mdl_id,
label = "fit"),
by = glbFeatsCategory, all = TRUE)
row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]
glbLvlCategory <- merge(glbLvlCategory,
myget_category_stats(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
label="OOB"),
#by=glbFeatsCategory, all=TRUE) glb_ctgry-df already contains .n.OOB ?
all = TRUE)
row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]
if (any(grepl("OOB", glbMdlMetricsEval)))
print(orderBy(~-err.abs.OOB.mean, glbLvlCategory)) else
print(orderBy(~-err.abs.fit.mean, glbLvlCategory))
print(colSums(glbLvlCategory[, -grep(glbFeatsCategory, names(glbLvlCategory))]))
}
write.csv(glbObsOOB[, c(glbFeatsId,
grep(glb_rsp_var, names(glbObsOOB), fixed=TRUE, value=TRUE))],
paste0(gsub(".", "_", paste0(glbOut$pfx, glb_sel_mdl_id), fixed=TRUE),
"_OOBobs.csv"), row.names=FALSE)
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "teardown")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
# if (sum(is.na(glbObsAll$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop(here"); glb2Sav()
sync_glb_obs_df <- function() {
# Merge or cbind ?
for (col in setdiff(names(glbObsFit), names(glbObsTrn)))
glbObsTrn[glbObsTrn$.lcn == "Fit", col] <<- glbObsFit[, col]
for (col in setdiff(names(glbObsFit), names(glbObsAll)))
glbObsAll[glbObsAll$.lcn == "Fit", col] <<- glbObsFit[, col]
if (all(is.na(glbObsNew[, glb_rsp_var])))
for (col in setdiff(names(glbObsOOB), names(glbObsTrn)))
glbObsTrn[glbObsTrn$.lcn == "OOB", col] <<- glbObsOOB[, col]
for (col in setdiff(names(glbObsOOB), names(glbObsAll)))
glbObsAll[glbObsAll$.lcn == "OOB", col] <<- glbObsOOB[, col]
}
sync_glb_obs_df()
print(setdiff(names(glbObsNew), names(glbObsAll)))
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord=TRUE)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
4.0: manage missing data#load(paste0(glb_inp_pfx, "dsk.RData"))
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
glb_fin_mdl <- glb_models_lst[[glb_fin_mdl_id]]
} else
# if (nrow(glbObsFit) + length(glbObsFitOutliers) == nrow(glbObsTrn))
if (!all(is.na(glbObsNew[, glb_rsp_var])))
{
warning("Final model same as glb_sel_mdl_id")
glb_fin_mdl_id <- paste0("Final.", glb_sel_mdl_id)
glb_fin_mdl <- glb_sel_mdl
glb_models_lst[[glb_fin_mdl_id]] <- glb_fin_mdl
} else {
if (grepl("RFE\\.X", names(glbMdlFamilies))) {
indep_vars <- myadjust_interaction_feats(subset(glb_feats_df,
!nzv & (exclude.as.feat != 1))[, "id"])
rfe_trn_results <-
myrun_rfe(glbObsTrn, indep_vars, glbRFESizes[["Final"]])
if (!isTRUE(all.equal(sort(predictors(rfe_trn_results)),
sort(predictors(rfe_fit_results))))) {
print("Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):")
print(setdiff(predictors(rfe_trn_results), predictors(rfe_fit_results)))
print("Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):")
print(setdiff(predictors(rfe_fit_results), predictors(rfe_trn_results)))
}
}
# }
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
# Fit selected models on glbObsTrn
for (mdl_id in gsub(".prob", "",
gsub(mygetPredictIds(glb_rsp_var)$value, "", row.names(mdlimp_df), fixed = TRUE),
fixed = TRUE)) {
mdl_id_components <- unlist(strsplit(mdl_id, "[.]"))
mdlIdPfx <- paste0(c(head(mdl_id_components, -1), "Train"),
collapse = ".")
if (grepl("RFE\\.X\\.", mdlIdPfx))
mdlIndepVars <- myadjust_interaction_feats(myextract_actual_feats(
predictors(rfe_trn_results))) else
mdlIndepVars <- trim(unlist(
strsplit(glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdlIdPfx,
type = glb_model_type, tune.df = glbMdlTuneParams,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = tail(mdl_id_components, 1))),
indep_vars = mdlIndepVars,
rsp_var = glb_rsp_var,
fit_df = glbObsTrn, OOB_df = NULL)
glbObsTrn <- glb_get_predictions(df = glbObsTrn,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
glbObsNew <- glb_get_predictions(df = glbObsNew,
mdl_id = tail(glb_models_df$id, 1),
rsp_var = glb_rsp_var,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
}
}
# "Final" model
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the mdl_id
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
if (glb_is_classification && glb_is_binomial)
indep_vars_vctr <- gsub("(.*)\\.(.*)\\.prob", "\\1\\.Train\\.\\2\\.prob",
row.names(mdlimp_df)) else
indep_vars_vctr <- gsub("(.*)\\.(.*)", "\\1\\.Train\\.\\2",
row.names(mdlimp_df))
} else
if (grepl("RFE.X", glb_sel_mdl_id, fixed = TRUE)) {
indep_vars_vctr <- myextract_actual_feats(predictors(rfe_trn_results))
} else indep_vars_vctr <-
trim(unlist(strsplit(glb_models_df[glb_models_df$id ==
glb_sel_mdl_id
, "feats"], "[,]")))
if (!is.null(glb_preproc_methods) &&
((match_pos <- regexpr(gsub(".", "\\.",
paste(glb_preproc_methods, collapse = "|"),
fixed = TRUE), glb_sel_mdl_id)) != -1))
ths_preProcess <- str_sub(glb_sel_mdl_id, match_pos,
match_pos + attr(match_pos, "match.length") - 1) else
ths_preProcess <- NULL
mdl_id_pfx <- ifelse(grepl("Ensemble", glb_sel_mdl_id),
"Final.Ensemble", "Final")
trnobs_df <- glbObsTrn
if (!is.null(glbObsTrnOutliers[[mdl_id_pfx]])) {
trnobs_df <- glbObsTrn[!(glbObsTrn[, glbFeatsId] %in% glbObsTrnOutliers[[mdl_id_pfx]]), ]
print(sprintf("Outliers removed: %d", nrow(glbObsTrn) - nrow(trnobs_df)))
print(setdiff(glbObsTrn[, glbFeatsId], trnobs_df[, glbFeatsId]))
}
# Force fitting of Final.glm to identify outliers
method_vctr <- unique(c(myparseMdlId(glb_sel_mdl_id)$alg, glbMdlFamilies[["Final"]]))
for (method in method_vctr) {
#source("caret_nominalTrainWorkflow.R")
# glmnet requires at least 2 indep vars
if ((length(indep_vars_vctr) == 1) && (method %in% "glmnet"))
next
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdl_id_pfx,
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
trainControl.allowParallel = glbMdlAllowParallel,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method,
train.preProcess = ths_preProcess)),
indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
fit_df = trnobs_df, OOB_df = NULL)
}
if ((length(method_vctr) == 1) || (method != "glm")) {
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "id"]
}
}
rm(ret_lst)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=FALSE)
#stop(here"); glb2Sav()
if (glb_is_classification && glb_is_binomial)
prob_threshold <- glb_models_df[glb_models_df$id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"] else
prob_threshold <- NULL
if (grepl("Ensemble", glb_fin_mdl_id)) {
# Get predictions for each model in ensemble; Outliers that have been moved to OOB might not have been predicted yet
mdlEnsembleComps <- unlist(str_split(subset(glb_models_df,
id == glb_fin_mdl_id)$feats, ","))
if (glb_is_classification && glb_is_binomial)
mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
mdlEnsembleComps <- gsub(paste0("^",
gsub(".", "\\.", mygetPredictIds(glb_rsp_var)$value, fixed = TRUE)),
"", mdlEnsembleComps)
for (mdl_id in mdlEnsembleComps) {
glbObsTrn <- glb_get_predictions(df = glbObsTrn, mdl_id = mdl_id,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
glbObsNew <- glb_get_predictions(df = glbObsNew, mdl_id = mdl_id,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
}
}
glbObsTrn <- glb_get_predictions(df = glbObsTrn, mdl_id = glb_fin_mdl_id,
rsp_var = glb_rsp_var,
prob_threshold_def = prob_threshold)
glb_featsimp_df <- myget_feats_importance(mdl=glb_fin_mdl,
featsimp_df=glb_featsimp_df)
#glb_featsimp_df[, paste0(glb_fin_mdl_id, ".imp")] <- glb_featsimp_df$imp
print(glb_featsimp_df)
if (glb_is_classification && glb_is_binomial)
glb_analytics_diag_plots(obs_df=glbObsTrn, mdl_id=glb_fin_mdl_id,
prob_threshold=glb_models_df[glb_models_df$id == glb_sel_mdl_id,
"opt.prob.threshold.OOB"]) else
glb_analytics_diag_plots(obs_df=glbObsTrn, mdl_id=glb_fin_mdl_id)
dsp_feats_vctr <- c(NULL)
for(var in grep(".imp", names(glb_feats_df), fixed=TRUE, value=TRUE))
dsp_feats_vctr <- union(dsp_feats_vctr,
glb_feats_df[!is.na(glb_feats_df[, var]), "id"])
# print(glbObsTrn[glbObsTrn$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glbObsTrn), value=TRUE)])
print(setdiff(names(glbObsTrn), names(glbObsAll)))
for (col in setdiff(names(glbObsTrn), names(glbObsAll)))
# Merge or cbind ?
glbObsAll[glbObsAll$.src == "Train", col] <- glbObsTrn[, col]
print(setdiff(names(glbObsFit), names(glbObsAll)))
print(setdiff(names(glbObsOOB), names(glbObsAll)))
for (col in setdiff(names(glbObsOOB), names(glbObsAll)))
# Merge or cbind ?
glbObsAll[glbObsAll$.lcn == "OOB", col] <- glbObsOOB[, col]
print(setdiff(names(glbObsNew), names(glbObsAll)))
#glb2Sav(); all.equal(savObsAll, glbObsAll); all.equal(sav_models_lst, glb_models_lst)
#load(file = paste0(glbOut$pfx, "dsk_knitr.RData"))
#cmpCols <- names(glbObsAll)[!grepl("\\.Final\\.", names(glbObsAll))]; all.equal(savObsAll[, cmpCols], glbObsAll[, cmpCols]); all.equal(savObsAll[, "H.P.http"], glbObsAll[, "H.P.http"]);
replay.petrisim(pn = glb_analytics_pn,
replay.trans = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord = TRUE)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc = TRUE)
4.0: manage missing dataNull Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent. #{r q1, cache=FALSE} # print(t.test(subset(cars_df, am_fctr == "automatic")$mpg, # subset(cars_df, am_fctr == "manual")$mpg, # var.equal=FALSE)$conf) # We reject the null hypothesis i.e. we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better for miles per gallon versus automatic transmission.
## label step_major step_minor label_minor bgn
## 7 extract.features.image 3 2 2 86.484
## 1 import.data 1 0 0 11.547
## 8 extract.features.price 3 3 3 996.684
## 2 inspect.data 2 0 0 78.873
## 3 scrub.data 2 1 1 84.806
## 11 extract.features.end 3 6 6 1019.343
## 9 extract.features.text 3 4 4 1019.222
## 10 extract.features.string 3 5 5 1019.286
## 4 transform.data 2 2 2 86.384
## 6 extract.features.datetime 3 1 1 86.448
## 5 extract.features 3 0 0 86.427
## end elapsed duration
## 7 996.684 910.200 910.200
## 1 78.872 67.325 67.325
## 8 1019.221 22.537 22.537
## 2 84.806 5.933 5.933
## 3 86.384 1.578 1.578
## 11 1020.226 0.883 0.883
## 9 1019.286 0.064 0.064
## 10 1019.342 0.056 0.056
## 4 86.426 0.042 0.042
## 6 86.484 0.036 0.036
## 5 86.448 0.021 0.021
## [1] "Total Elapsed Time: 1,020.226 secs"
## label step_major step_minor
## 5 extract.features.image.Image.patch.search 5 0
## 2 extract.features.image.Image.bgn 2 0
## 4 extract.features.image.Image.patch.mean 4 0
## 3 extract.features.image.Image.display 3 0
## 1 extract.features.image.bgn 1 0
## label_minor bgn end elapsed duration
## 5 0 254.276 996.627 742.352 742.351
## 2 0 86.523 238.840 152.318 152.317
## 4 0 245.265 254.275 9.010 9.010
## 3 0 238.841 245.264 6.423 6.423
## 1 0 86.515 86.523 0.008 0.008
## [1] "Total Elapsed Time: 996.627 secs"